Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c9
-rw-r--r--net/8021q/vlan.h19
-rw-r--r--net/9p/client.c20
-rw-r--r--net/9p/trans_common.c4
-rw-r--r--net/9p/trans_fd.c6
-rw-r--r--net/9p/trans_rdma.c2
-rw-r--r--net/9p/trans_virtio.c9
-rw-r--r--net/9p/trans_xen.c4
-rw-r--r--net/Kconfig3
-rw-r--r--net/Makefile1
-rw-r--r--net/appletalk/aarp.c18
-rw-r--r--net/appletalk/ddp.c7
-rw-r--r--net/atm/lec.c7
-rw-r--r--net/atm/raw.c12
-rw-r--r--net/atm/signaling.c2
-rw-r--r--net/batman-adv/Kconfig27
-rw-r--r--net/batman-adv/Makefile3
-rw-r--r--net/batman-adv/bat_algo.c34
-rw-r--r--net/batman-adv/bat_algo.h5
-rw-r--r--net/batman-adv/bat_iv_ogm.c230
-rw-r--r--net/batman-adv/bat_v.c247
-rw-r--r--net/batman-adv/bat_v_elp.c2
-rw-r--r--net/batman-adv/bat_v_ogm.c2
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c132
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h16
-rw-r--r--net/batman-adv/debugfs.c442
-rw-r--r--net/batman-adv/debugfs.h73
-rw-r--r--net/batman-adv/distributed-arp-table.c55
-rw-r--r--net/batman-adv/distributed-arp-table.h2
-rw-r--r--net/batman-adv/fragmentation.c31
-rw-r--r--net/batman-adv/gateway_client.c39
-rw-r--r--net/batman-adv/gateway_client.h2
-rw-r--r--net/batman-adv/hard-interface.c57
-rw-r--r--net/batman-adv/hard-interface.h26
-rw-r--r--net/batman-adv/icmp_socket.c392
-rw-r--r--net/batman-adv/icmp_socket.h38
-rw-r--r--net/batman-adv/log.c208
-rw-r--r--net/batman-adv/main.c47
-rw-r--r--net/batman-adv/main.h5
-rw-r--r--net/batman-adv/multicast.c127
-rw-r--r--net/batman-adv/multicast.h3
-rw-r--r--net/batman-adv/netlink.c7
-rw-r--r--net/batman-adv/network-coding.c91
-rw-r--r--net/batman-adv/network-coding.h13
-rw-r--r--net/batman-adv/originator.c121
-rw-r--r--net/batman-adv/originator.h4
-rw-r--r--net/batman-adv/routing.c10
-rw-r--r--net/batman-adv/send.c2
-rw-r--r--net/batman-adv/soft-interface.c141
-rw-r--r--net/batman-adv/soft-interface.h1
-rw-r--r--net/batman-adv/sysfs.c1272
-rw-r--r--net/batman-adv/sysfs.h93
-rw-r--r--net/batman-adv/tp_meter.c1
-rw-r--r--net/batman-adv/translation-table.c212
-rw-r--r--net/batman-adv/translation-table.h3
-rw-r--r--net/batman-adv/types.h70
-rw-r--r--net/bluetooth/Kconfig1
-rw-r--r--net/bluetooth/a2mp.c22
-rw-r--r--net/bluetooth/hci_conn.c14
-rw-r--r--net/bluetooth/hci_core.c96
-rw-r--r--net/bluetooth/hci_debugfs.c50
-rw-r--r--net/bluetooth/hci_event.c133
-rw-r--r--net/bluetooth/hci_request.c364
-rw-r--r--net/bluetooth/hci_request.h2
-rw-r--r--net/bluetooth/hidp/core.c2
-rw-r--r--net/bluetooth/l2cap_core.c17
-rw-r--r--net/bluetooth/l2cap_sock.c21
-rw-r--r--net/bluetooth/mgmt.c493
-rw-r--r--net/bluetooth/mgmt_config.c187
-rw-r--r--net/bluetooth/msft.c3
-rw-r--r--net/bluetooth/sco.c11
-rw-r--r--net/bluetooth/smp.c44
-rw-r--r--net/bluetooth/smp.h2
-rw-r--r--net/bpf/test_run.c89
-rw-r--r--net/bpfilter/Kconfig1
-rw-r--r--net/bridge/Kconfig11
-rw-r--r--net/bridge/Makefile2
-rw-r--r--net/bridge/br.c10
-rw-r--r--net/bridge/br_cfm.c867
-rw-r--r--net/bridge/br_cfm_netlink.c726
-rw-r--r--net/bridge/br_device.c63
-rw-r--r--net/bridge/br_forward.c17
-rw-r--r--net/bridge/br_if.c1
-rw-r--r--net/bridge/br_input.c41
-rw-r--r--net/bridge/br_ioctl.c2
-rw-r--r--net/bridge/br_mdb.c599
-rw-r--r--net/bridge/br_mrp.c68
-rw-r--r--net/bridge/br_mrp_netlink.c2
-rw-r--r--net/bridge/br_mrp_switchdev.c7
-rw-r--r--net/bridge/br_multicast.c1868
-rw-r--r--net/bridge/br_netfilter_hooks.c7
-rw-r--r--net/bridge/br_netlink.c121
-rw-r--r--net/bridge/br_private.h232
-rw-r--r--net/bridge/br_private_cfm.h147
-rw-r--r--net/bridge/br_private_mrp.h34
-rw-r--r--net/bridge/br_vlan.c41
-rw-r--r--net/bridge/netfilter/Kconfig4
-rw-r--r--net/bridge/netfilter/ebt_dnat.c2
-rw-r--r--net/bridge/netfilter/ebt_redirect.c2
-rw-r--r--net/bridge/netfilter/ebt_snat.c2
-rw-r--r--net/bridge/netfilter/ebt_stp.c1
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c255
-rw-r--r--net/caif/cfsrvl.c1
-rw-r--r--net/can/Kconfig15
-rw-r--r--net/can/Makefile3
-rw-r--r--net/can/af_can.c55
-rw-r--r--net/can/bcm.c6
-rw-r--r--net/can/gw.c86
-rw-r--r--net/can/isotp.c1446
-rw-r--r--net/can/j1939/main.c4
-rw-r--r--net/can/j1939/socket.c6
-rw-r--r--net/can/j1939/transport.c2
-rw-r--r--net/can/proc.c18
-rw-r--r--net/can/raw.c34
-rw-r--r--net/ceph/Kconfig3
-rw-r--r--net/ceph/Makefile3
-rw-r--r--net/ceph/auth.c408
-rw-r--r--net/ceph/auth_none.c5
-rw-r--r--net/ceph/auth_x.c309
-rw-r--r--net/ceph/auth_x_protocol.h3
-rw-r--r--net/ceph/ceph_common.c63
-rw-r--r--net/ceph/ceph_strings.c28
-rw-r--r--net/ceph/crypto.c3
-rw-r--r--net/ceph/crypto.h3
-rw-r--r--net/ceph/decode.c101
-rw-r--r--net/ceph/messenger.c1955
-rw-r--r--net/ceph/messenger_v1.c1506
-rw-r--r--net/ceph/messenger_v2.c3459
-rw-r--r--net/ceph/mon_client.c403
-rw-r--r--net/ceph/osd_client.c151
-rw-r--r--net/ceph/osdmap.c211
-rw-r--r--net/compat.c4
-rw-r--r--net/core/bpf_sk_storage.c938
-rw-r--r--net/core/datagram.c47
-rw-r--r--net/core/dev.c424
-rw-r--r--net/core/dev_ioctl.c2
-rw-r--r--net/core/devlink.c980
-rw-r--r--net/core/drop_monitor.c139
-rw-r--r--net/core/fib_rules.c4
-rw-r--r--net/core/filter.c1030
-rw-r--r--net/core/flow_dissector.c12
-rw-r--r--net/core/flow_offload.c4
-rw-r--r--net/core/gen_estimator.c11
-rw-r--r--net/core/gro_cells.c7
-rw-r--r--net/core/lwt_bpf.c12
-rw-r--r--net/core/neighbour.c15
-rw-r--r--net/core/net-procfs.c15
-rw-r--r--net/core/net-sysfs.c75
-rw-r--r--net/core/net_namespace.c18
-rw-r--r--net/core/netclassid_cgroup.c3
-rw-r--r--net/core/netpoll.c24
-rw-r--r--net/core/netprio_cgroup.c3
-rw-r--r--net/core/page_pool.c70
-rw-r--r--net/core/pktgen.c10
-rw-r--r--net/core/ptp_classifier.c30
-rw-r--r--net/core/rtnetlink.c72
-rw-r--r--net/core/skbuff.c178
-rw-r--r--net/core/skmsg.c280
-rw-r--r--net/core/sock.c66
-rw-r--r--net/core/sock_diag.c9
-rw-r--r--net/core/sock_map.c483
-rw-r--r--net/core/sock_reuseport.c2
-rw-r--r--net/core/sysctl_net_core.c17
-rw-r--r--net/core/xdp.c86
-rw-r--r--net/dcb/dcbnl.c18
-rw-r--r--net/dccp/ackvec.c7
-rw-r--r--net/dccp/ccid.c2
-rw-r--r--net/dccp/ccids/ccid2.c5
-rw-r--r--net/dccp/ccids/ccid3.c6
-rw-r--r--net/dccp/ccids/lib/loss_interval.c3
-rw-r--r--net/dccp/ccids/lib/packet_history.c3
-rw-r--r--net/dccp/feat.c6
-rw-r--r--net/dccp/ipv4.c12
-rw-r--r--net/dccp/ipv6.c8
-rw-r--r--net/dccp/output.c9
-rw-r--r--net/dccp/qpolicy.c6
-rw-r--r--net/dccp/timer.c15
-rw-r--r--net/decnet/dn_dev.c2
-rw-r--r--net/decnet/dn_route.c2
-rw-r--r--net/dsa/Kconfig11
-rw-r--r--net/dsa/Makefile4
-rw-r--r--net/dsa/dsa.c58
-rw-r--r--net/dsa/dsa2.c143
-rw-r--r--net/dsa/dsa_priv.h64
-rw-r--r--net/dsa/master.c37
-rw-r--r--net/dsa/port.c104
-rw-r--r--net/dsa/slave.c306
-rw-r--r--net/dsa/switch.c50
-rw-r--r--net/dsa/tag_8021q.c158
-rw-r--r--net/dsa/tag_ar9331.c3
-rw-r--r--net/dsa/tag_brcm.c38
-rw-r--r--net/dsa/tag_dsa.c335
-rw-r--r--net/dsa/tag_edsa.c215
-rw-r--r--net/dsa/tag_gswip.c5
-rw-r--r--net/dsa/tag_hellcreek.c64
-rw-r--r--net/dsa/tag_ksz.c76
-rw-r--r--net/dsa/tag_lan9303.c9
-rw-r--r--net/dsa/tag_mtk.c13
-rw-r--r--net/dsa/tag_ocelot.c61
-rw-r--r--net/dsa/tag_qca.c13
-rw-r--r--net/dsa/tag_rtl4_a.c11
-rw-r--r--net/dsa/tag_sja1105.c33
-rw-r--r--net/dsa/tag_trailer.c32
-rw-r--r--net/ethernet/eth.c6
-rw-r--r--net/ethtool/bitset.c28
-rw-r--r--net/ethtool/cabletest.c41
-rw-r--r--net/ethtool/channels.c43
-rw-r--r--net/ethtool/coalesce.c45
-rw-r--r--net/ethtool/common.c2
-rw-r--r--net/ethtool/debug.c24
-rw-r--r--net/ethtool/eee.c32
-rw-r--r--net/ethtool/features.c32
-rw-r--r--net/ethtool/ioctl.c69
-rw-r--r--net/ethtool/linkinfo.c30
-rw-r--r--net/ethtool/linkmodes.c34
-rw-r--r--net/ethtool/linkstate.c14
-rw-r--r--net/ethtool/netlink.c124
-rw-r--r--net/ethtool/netlink.h35
-rw-r--r--net/ethtool/pause.c86
-rw-r--r--net/ethtool/privflags.c24
-rw-r--r--net/ethtool/rings.c35
-rw-r--r--net/ethtool/strset.c28
-rw-r--r--net/ethtool/tsinfo.c13
-rw-r--r--net/ethtool/tunnels.c42
-rw-r--r--net/ethtool/wol.c24
-rw-r--r--net/hsr/hsr_debugfs.c21
-rw-r--r--net/hsr/hsr_main.h5
-rw-r--r--net/hsr/hsr_netlink.c6
-rw-r--r--net/ieee802154/netlink.c6
-rw-r--r--net/ieee802154/nl-mac.c2
-rw-r--r--net/ipv4/af_inet.c3
-rw-r--r--net/ipv4/arp.c6
-rw-r--r--net/ipv4/bpf_tcp_ca.c37
-rw-r--r--net/ipv4/cipso_ipv4.c2
-rw-r--r--net/ipv4/devinet.c5
-rw-r--r--net/ipv4/esp4.c7
-rw-r--r--net/ipv4/fib_frontend.c6
-rw-r--r--net/ipv4/fib_semantics.c7
-rw-r--r--net/ipv4/fib_trie.c9
-rw-r--r--net/ipv4/fou.c10
-rw-r--r--net/ipv4/gre_demux.c2
-rw-r--r--net/ipv4/icmp.c50
-rw-r--r--net/ipv4/inet_connection_sock.c9
-rw-r--r--net/ipv4/inet_diag.c21
-rw-r--r--net/ipv4/inet_fragment.c47
-rw-r--r--net/ipv4/inet_hashtables.c74
-rw-r--r--net/ipv4/inet_timewait_sock.c4
-rw-r--r--net/ipv4/ip_gre.c21
-rw-r--r--net/ipv4/ip_options.c35
-rw-r--r--net/ipv4/ip_output.c21
-rw-r--r--net/ipv4/ip_sockglue.c5
-rw-r--r--net/ipv4/ip_tunnel.c38
-rw-r--r--net/ipv4/ip_tunnel_core.c34
-rw-r--r--net/ipv4/ip_vti.c11
-rw-r--r--net/ipv4/ipconfig.c14
-rw-r--r--net/ipv4/ipip.c2
-rw-r--r--net/ipv4/ipmr.c14
-rw-r--r--net/ipv4/metrics.c2
-rw-r--r--net/ipv4/netfilter.c8
-rw-r--r--net/ipv4/netfilter/arp_tables.c16
-rw-r--r--net/ipv4/netfilter/ip_tables.c16
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c3
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c2
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c2
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c19
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c134
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c3
-rw-r--r--net/ipv4/nexthop.c317
-rw-r--r--net/ipv4/ping.c31
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/raw.c9
-rw-r--r--net/ipv4/route.c45
-rw-r--r--net/ipv4/syncookies.c17
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--net/ipv4/tcp.c653
-rw-r--r--net/ipv4/tcp_bbr.c2
-rw-r--r--net/ipv4/tcp_bpf.c31
-rw-r--r--net/ipv4/tcp_cong.c32
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_input.c302
-rw-r--r--net/ipv4/tcp_ipv4.c87
-rw-r--r--net/ipv4/tcp_lp.c7
-rw-r--r--net/ipv4/tcp_metrics.c8
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c246
-rw-r--r--net/ipv4/tcp_recovery.c24
-rw-r--r--net/ipv4/tcp_scalable.c2
-rw-r--r--net/ipv4/tcp_timer.c55
-rw-r--r--net/ipv4/tcp_vegas.c8
-rw-r--r--net/ipv4/udp.c19
-rw-r--r--net/ipv4/udp_bpf.c9
-rw-r--r--net/ipv4/udp_diag.c2
-rw-r--r--net/ipv4/udp_offload.c93
-rw-r--r--net/ipv4/udp_tunnel_nic.c96
-rw-r--r--net/ipv4/xfrm4_tunnel.c4
-rw-r--r--net/ipv6/addrconf.c12
-rw-r--r--net/ipv6/addrconf_core.c8
-rw-r--r--net/ipv6/addrlabel.c26
-rw-r--r--net/ipv6/af_inet6.c6
-rw-r--r--net/ipv6/ah6.c3
-rw-r--r--net/ipv6/calipso.c6
-rw-r--r--net/ipv6/datagram.c2
-rw-r--r--net/ipv6/esp6.c7
-rw-r--r--net/ipv6/exthdrs.c5
-rw-r--r--net/ipv6/icmp.c25
-rw-r--r--net/ipv6/inet6_connection_sock.c4
-rw-r--r--net/ipv6/inet6_hashtables.c6
-rw-r--r--net/ipv6/ip6_fib.c21
-rw-r--r--net/ipv6/ip6_gre.c55
-rw-r--r--net/ipv6/ip6_output.c47
-rw-r--r--net/ipv6/ip6_tunnel.c51
-rw-r--r--net/ipv6/ip6_vti.c11
-rw-r--r--net/ipv6/ipv6_sockglue.c2
-rw-r--r--net/ipv6/mcast.c2
-rw-r--r--net/ipv6/ndisc.c7
-rw-r--r--net/ipv6/netfilter.c6
-rw-r--r--net/ipv6/netfilter/ip6_tables.c16
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c39
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c2
-rw-r--r--net/ipv6/netfilter/ip6table_mangle.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c10
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c146
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c3
-rw-r--r--net/ipv6/ping.c2
-rw-r--r--net/ipv6/proc.c2
-rw-r--r--net/ipv6/raw.c4
-rw-r--r--net/ipv6/reassembly.c17
-rw-r--r--net/ipv6/route.c15
-rw-r--r--net/ipv6/rpl.c2
-rw-r--r--net/ipv6/rpl_iptunnel.c9
-rw-r--r--net/ipv6/seg6_hmac.c1
-rw-r--r--net/ipv6/seg6_local.c590
-rw-r--r--net/ipv6/sit.c9
-rw-r--r--net/ipv6/syncookies.c12
-rw-r--r--net/ipv6/tcp_ipv6.c63
-rw-r--r--net/ipv6/udp.c10
-rw-r--r--net/ipv6/udp_offload.c27
-rw-r--r--net/ipv6/xfrm6_tunnel.c4
-rw-r--r--net/iucv/af_iucv.c17
-rw-r--r--net/iucv/iucv.c8
-rw-r--r--net/key/af_key.c6
-rw-r--r--net/l2tp/Makefile2
-rw-r--r--net/l2tp/l2tp_core.c329
-rw-r--r--net/l2tp/l2tp_core.h33
-rw-r--r--net/l2tp/l2tp_debugfs.c4
-rw-r--r--net/l2tp/l2tp_eth.c13
-rw-r--r--net/l2tp/l2tp_ip.c17
-rw-r--r--net/l2tp/l2tp_ip6.c19
-rw-r--r--net/l2tp/l2tp_netlink.c30
-rw-r--r--net/l2tp/l2tp_ppp.c70
-rw-r--r--net/l2tp/trace.h211
-rw-r--r--net/l3mdev/l3mdev.c1
-rw-r--r--net/lapb/lapb_iface.c151
-rw-r--r--net/lapb/lapb_out.c3
-rw-r--r--net/lapb/lapb_timer.c41
-rw-r--r--net/llc/llc_conn.c2
-rw-r--r--net/mac80211/Kconfig2
-rw-r--r--net/mac80211/Makefile1
-rw-r--r--net/mac80211/agg-rx.c10
-rw-r--r--net/mac80211/agg-tx.c12
-rw-r--r--net/mac80211/cfg.c151
-rw-r--r--net/mac80211/chan.c83
-rw-r--r--net/mac80211/debugfs.c47
-rw-r--r--net/mac80211/debugfs_key.c2
-rw-r--r--net/mac80211/debugfs_netdev.c17
-rw-r--r--net/mac80211/debugfs_sta.c4
-rw-r--r--net/mac80211/driver-ops.c5
-rw-r--r--net/mac80211/driver-ops.h29
-rw-r--r--net/mac80211/ibss.c7
-rw-r--r--net/mac80211/ieee80211_i.h72
-rw-r--r--net/mac80211/iface.c1031
-rw-r--r--net/mac80211/key.c64
-rw-r--r--net/mac80211/main.c24
-rw-r--r--net/mac80211/mesh.c36
-rw-r--r--net/mac80211/mesh_hwmp.c4
-rw-r--r--net/mac80211/mesh_pathtbl.c4
-rw-r--r--net/mac80211/mesh_plink.c1
-rw-r--r--net/mac80211/mesh_ps.c6
-rw-r--r--net/mac80211/mlme.c359
-rw-r--r--net/mac80211/offchannel.c40
-rw-r--r--net/mac80211/pm.c15
-rw-r--r--net/mac80211/rate.c43
-rw-r--r--net/mac80211/rc80211_minstrel.c27
-rw-r--r--net/mac80211/rc80211_minstrel.h1
-rw-r--r--net/mac80211/rx.c139
-rw-r--r--net/mac80211/s1g.c16
-rw-r--r--net/mac80211/scan.c43
-rw-r--r--net/mac80211/spectmgmt.c10
-rw-r--r--net/mac80211/sta_info.c36
-rw-r--r--net/mac80211/sta_info.h12
-rw-r--r--net/mac80211/status.c247
-rw-r--r--net/mac80211/trace.h56
-rw-r--r--net/mac80211/tx.c362
-rw-r--r--net/mac80211/util.c266
-rw-r--r--net/mac80211/vht.c18
-rw-r--r--net/mac80211/wme.c18
-rw-r--r--net/mac802154/main.c8
-rw-r--r--net/mpls/af_mpls.c2
-rw-r--r--net/mpls/mpls_iptunnel.c1
-rw-r--r--net/mptcp/Kconfig6
-rw-r--r--net/mptcp/crypto.c2
-rw-r--r--net/mptcp/ctrl.c14
-rw-r--r--net/mptcp/mib.c10
-rw-r--r--net/mptcp/mib.h9
-rw-r--r--net/mptcp/mptcp_diag.c2
-rw-r--r--net/mptcp/options.c336
-rw-r--r--net/mptcp/pm.c136
-rw-r--r--net/mptcp/pm_netlink.c389
-rw-r--r--net/mptcp/protocol.c2223
-rw-r--r--net/mptcp/protocol.h237
-rw-r--r--net/mptcp/subflow.c277
-rw-r--r--net/mptcp/token.c2
-rw-r--r--net/ncsi/ncsi-manage.c5
-rw-r--r--net/ncsi/ncsi-netlink.c28
-rw-r--r--net/ncsi/ncsi-netlink.h3
-rw-r--r--net/ncsi/ncsi-rsp.c2
-rw-r--r--net/netfilter/Kconfig11
-rw-r--r--net/netfilter/Makefile1
-rw-r--r--net/netfilter/core.c129
-rw-r--r--net/netfilter/ipset/ip_set_core.c29
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h87
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c7
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c7
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c7
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c7
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c7
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c7
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c11
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c7
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c7
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c7
-rw-r--r--net/netfilter/ipvs/Kconfig1
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c18
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c25
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c44
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c10
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c7
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c6
-rw-r--r--net/netfilter/nf_conntrack_core.c28
-rw-r--r--net/netfilter/nf_conntrack_netlink.c36
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c13
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c13
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c38
-rw-r--r--net/netfilter/nf_conntrack_standalone.c9
-rw-r--r--net/netfilter/nf_dup_netdev.c1
-rw-r--r--net/netfilter/nf_flow_table_core.c16
-rw-r--r--net/netfilter/nf_flow_table_ip.c45
-rw-r--r--net/netfilter/nf_log_common.c12
-rw-r--r--net/netfilter/nf_nat_core.c1
-rw-r--r--net/netfilter/nf_nat_proto.c4
-rw-r--r--net/netfilter/nf_synproxy_core.c4
-rw-r--r--net/netfilter/nf_tables_api.c448
-rw-r--r--net/netfilter/nf_tables_core.c15
-rw-r--r--net/netfilter/nf_tables_offload.c23
-rw-r--r--net/netfilter/nfnetlink.c41
-rw-r--r--net/netfilter/nfnetlink_acct.c40
-rw-r--r--net/netfilter/nfnetlink_cthelper.c4
-rw-r--r--net/netfilter/nft_bitwise.c141
-rw-r--r--net/netfilter/nft_chain_filter.c35
-rw-r--r--net/netfilter/nft_chain_route.c4
-rw-r--r--net/netfilter/nft_cmp.c21
-rw-r--r--net/netfilter/nft_ct.c4
-rw-r--r--net/netfilter/nft_dynset.c192
-rw-r--r--net/netfilter/nft_fwd_netdev.c1
-rw-r--r--net/netfilter/nft_log.c2
-rw-r--r--net/netfilter/nft_meta.c16
-rw-r--r--net/netfilter/nft_payload.c98
-rw-r--r--net/netfilter/nft_reject.c12
-rw-r--r--net/netfilter/nft_reject_inet.c74
-rw-r--r--net/netfilter/nft_reject_netdev.c189
-rw-r--r--net/netfilter/nft_set_hash.c27
-rw-r--r--net/netfilter/nft_socket.c27
-rw-r--r--net/netfilter/utils.c4
-rw-r--r--net/netfilter/x_tables.c49
-rw-r--r--net/netfilter/xt_HMARK.c2
-rw-r--r--net/netfilter/xt_RATEEST.c3
-rw-r--r--net/netfilter/xt_nfacct.c2
-rw-r--r--net/netfilter/xt_recent.c12
-rw-r--r--net/netlabel/netlabel_calipso.c11
-rw-r--r--net/netlabel/netlabel_cipso_v4.c6
-rw-r--r--net/netlabel/netlabel_domainhash.c5
-rw-r--r--net/netlabel/netlabel_mgmt.c8
-rw-r--r--net/netlabel/netlabel_unlabeled.c23
-rw-r--r--net/netlink/af_netlink.c68
-rw-r--r--net/netlink/genetlink.c377
-rw-r--r--net/netlink/policy.c288
-rw-r--r--net/nfc/Kconfig2
-rw-r--r--net/nfc/core.c10
-rw-r--r--net/nfc/digital_core.c3
-rw-r--r--net/nfc/digital_dep.c3
-rw-r--r--net/nfc/nci/core.c20
-rw-r--r--net/nfc/nci/hci.c9
-rw-r--r--net/nfc/nci/ntf.c21
-rw-r--r--net/nfc/nci/rsp.c81
-rw-r--r--net/nfc/netlink.c5
-rw-r--r--net/nfc/rawsock.c2
-rw-r--r--net/openvswitch/actions.c59
-rw-r--r--net/openvswitch/conntrack.c24
-rw-r--r--net/openvswitch/datapath.c84
-rw-r--r--net/openvswitch/flow.c4
-rw-r--r--net/openvswitch/flow_netlink.c74
-rw-r--r--net/openvswitch/flow_table.c130
-rw-r--r--net/openvswitch/flow_table.h9
-rw-r--r--net/openvswitch/meter.c8
-rw-r--r--net/openvswitch/vport-internal_dev.c55
-rw-r--r--net/openvswitch/vport.c11
-rw-r--r--net/packet/af_packet.c89
-rw-r--r--net/packet/internal.h5
-rw-r--r--net/psample/psample.c6
-rw-r--r--net/qrtr/mhi.c6
-rw-r--r--net/qrtr/ns.c15
-rw-r--r--net/qrtr/qrtr.c65
-rw-r--r--net/qrtr/qrtr.h2
-rw-r--r--net/qrtr/tun.c6
-rw-r--r--net/rds/cong.c2
-rw-r--r--net/rds/ib.c10
-rw-r--r--net/rds/ib.h13
-rw-r--r--net/rds/ib_cm.c135
-rw-r--r--net/rds/ib_recv.c24
-rw-r--r--net/rds/ib_send.c8
-rw-r--r--net/rds/rdma.c5
-rw-r--r--net/rfkill/core.c44
-rw-r--r--net/rose/rose_loopback.c17
-rw-r--r--net/rxrpc/Makefile1
-rw-r--r--net/rxrpc/af_rxrpc.c13
-rw-r--r--net/rxrpc/ar-internal.h134
-rw-r--r--net/rxrpc/call_accept.c15
-rw-r--r--net/rxrpc/call_object.c45
-rw-r--r--net/rxrpc/conn_client.c1098
-rw-r--r--net/rxrpc/conn_event.c28
-rw-r--r--net/rxrpc/conn_object.c14
-rw-r--r--net/rxrpc/conn_service.c9
-rw-r--r--net/rxrpc/input.c2
-rw-r--r--net/rxrpc/insecure.c19
-rw-r--r--net/rxrpc/key.c664
-rw-r--r--net/rxrpc/local_object.c4
-rw-r--r--net/rxrpc/net_ns.c5
-rw-r--r--net/rxrpc/output.c6
-rw-r--r--net/rxrpc/proc.c2
-rw-r--r--net/rxrpc/recvmsg.c2
-rw-r--r--net/rxrpc/rtt.c1
-rw-r--r--net/rxrpc/rxkad.c264
-rw-r--r--net/rxrpc/security.c98
-rw-r--r--net/rxrpc/sendmsg.c45
-rw-r--r--net/rxrpc/server_key.c143
-rw-r--r--net/rxrpc/sysctl.c10
-rw-r--r--net/sched/Kconfig8
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_api.c98
-rw-r--r--net/sched/act_bpf.c2
-rw-r--r--net/sched/act_ct.c21
-rw-r--r--net/sched/act_ctinfo.c5
-rw-r--r--net/sched/act_gate.c4
-rw-r--r--net/sched/act_ipt.c2
-rw-r--r--net/sched/act_mirred.c21
-rw-r--r--net/sched/act_mpls.c22
-rw-r--r--net/sched/act_simple.c4
-rw-r--r--net/sched/act_tunnel_key.c2
-rw-r--r--net/sched/act_vlan.c40
-rw-r--r--net/sched/cls_api.c42
-rw-r--r--net/sched/cls_flower.c26
-rw-r--r--net/sched/cls_rsvp.h2
-rw-r--r--net/sched/cls_tcindex.c8
-rw-r--r--net/sched/cls_u32.c19
-rw-r--r--net/sched/em_cmp.c2
-rw-r--r--net/sched/sch_api.c9
-rw-r--r--net/sched/sch_atm.c8
-rw-r--r--net/sched/sch_cbs.c1
-rw-r--r--net/sched/sch_choke.c2
-rw-r--r--net/sched/sch_fq_pie.c2
-rw-r--r--net/sched/sch_frag.c150
-rw-r--r--net/sched/sch_generic.c23
-rw-r--r--net/sched/sch_gred.c2
-rw-r--r--net/sched/sch_netem.c9
-rw-r--r--net/sched/sch_pie.c2
-rw-r--r--net/sched/sch_red.c2
-rw-r--r--net/sched/sch_sfq.c2
-rw-r--r--net/sched/sch_taprio.c21
-rw-r--r--net/sctp/Kconfig1
-rw-r--r--net/sctp/associola.c8
-rw-r--r--net/sctp/auth.c4
-rw-r--r--net/sctp/bind_addr.c2
-rw-r--r--net/sctp/chunk.c2
-rw-r--r--net/sctp/input.c4
-rw-r--r--net/sctp/ipv6.c44
-rw-r--r--net/sctp/offload.c6
-rw-r--r--net/sctp/output.c22
-rw-r--r--net/sctp/proc.c16
-rw-r--r--net/sctp/protocol.c150
-rw-r--r--net/sctp/sm_make_chunk.c27
-rw-r--r--net/sctp/sm_sideeffect.c8
-rw-r--r--net/sctp/sm_statefuns.c52
-rw-r--r--net/sctp/socket.c116
-rw-r--r--net/sctp/sysctl.c62
-rw-r--r--net/sctp/transport.c6
-rw-r--r--net/sctp/ulpqueue.c2
-rw-r--r--net/smc/Makefile2
-rw-r--r--net/smc/af_smc.c921
-rw-r--r--net/smc/smc.h19
-rw-r--r--net/smc/smc_cdc.c10
-rw-r--r--net/smc/smc_clc.c505
-rw-r--r--net/smc/smc_clc.h256
-rw-r--r--net/smc/smc_close.c4
-rw-r--r--net/smc/smc_core.c493
-rw-r--r--net/smc/smc_core.h74
-rw-r--r--net/smc/smc_diag.c47
-rw-r--r--net/smc/smc_ib.c216
-rw-r--r--net/smc/smc_ib.h6
-rw-r--r--net/smc/smc_ism.c130
-rw-r--r--net/smc/smc_ism.h8
-rw-r--r--net/smc/smc_llc.c21
-rw-r--r--net/smc/smc_netlink.c85
-rw-r--r--net/smc/smc_netlink.h32
-rw-r--r--net/smc/smc_netns.h1
-rw-r--r--net/smc/smc_pnet.c176
-rw-r--r--net/smc/smc_pnet.h15
-rw-r--r--net/smc/smc_tx.c10
-rw-r--r--net/smc/smc_wr.c14
-rw-r--r--net/socket.c76
-rw-r--r--net/sunrpc/Kconfig1
-rw-r--r--net/sunrpc/addr.c2
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c30
-rw-r--r--net/sunrpc/auth_gss/auth_gss_internal.h45
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c276
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c126
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seqnum.c87
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c65
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c15
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c3
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c27
-rw-r--r--net/sunrpc/backchannel_rqst.c4
-rw-r--r--net/sunrpc/cache.c60
-rw-r--r--net/sunrpc/clnt.c83
-rw-r--r--net/sunrpc/debugfs.c4
-rw-r--r--net/sunrpc/rpc_pipe.c3
-rw-r--r--net/sunrpc/rpcb_clnt.c129
-rw-r--r--net/sunrpc/sched.c117
-rw-r--r--net/sunrpc/socklib.c2
-rw-r--r--net/sunrpc/sunrpc.h2
-rw-r--r--net/sunrpc/svc.c16
-rw-r--r--net/sunrpc/svc_xprt.c6
-rw-r--r--net/sunrpc/svcsock.c95
-rw-r--r--net/sunrpc/sysctl.c19
-rw-r--r--net/sunrpc/xdr.c999
-rw-r--r--net/sunrpc/xprt.c141
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c6
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c83
-rw-r--r--net/sunrpc/xprtrdma/module.c1
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c72
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c14
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_pcl.c306
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c314
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c602
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c561
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c2
-rw-r--r--net/sunrpc/xprtrdma/transport.c15
-rw-r--r--net/sunrpc/xprtrdma/verbs.c30
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h9
-rw-r--r--net/sunrpc/xprtsock.c12
-rw-r--r--net/switchdev/switchdev.c23
-rw-r--r--net/tipc/addr.c7
-rw-r--r--net/tipc/addr.h1
-rw-r--r--net/tipc/bcast.c10
-rw-r--r--net/tipc/bearer.c27
-rw-r--r--net/tipc/bearer.h10
-rw-r--r--net/tipc/core.c8
-rw-r--r--net/tipc/core.h23
-rw-r--r--net/tipc/crypto.c1036
-rw-r--r--net/tipc/crypto.h49
-rw-r--r--net/tipc/discover.c5
-rw-r--r--net/tipc/group.c3
-rw-r--r--net/tipc/group.h3
-rw-r--r--net/tipc/link.c69
-rw-r--r--net/tipc/msg.c31
-rw-r--r--net/tipc/msg.h8
-rw-r--r--net/tipc/name_distr.c58
-rw-r--r--net/tipc/name_distr.h2
-rw-r--r--net/tipc/name_table.c57
-rw-r--r--net/tipc/name_table.h9
-rw-r--r--net/tipc/net.c22
-rw-r--r--net/tipc/net.h1
-rw-r--r--net/tipc/netlink.c2
-rw-r--r--net/tipc/netlink_compat.c25
-rw-r--r--net/tipc/node.c162
-rw-r--r--net/tipc/node.h2
-rw-r--r--net/tipc/socket.c224
-rw-r--r--net/tipc/socket.h2
-rw-r--r--net/tipc/subscr.c13
-rw-r--r--net/tipc/subscr.h16
-rw-r--r--net/tipc/sysctl.c9
-rw-r--r--net/tipc/topsrv.c17
-rw-r--r--net/tipc/trace.c2
-rw-r--r--net/tipc/udp_media.c9
-rw-r--r--net/tls/tls_device.c59
-rw-r--r--net/tls/tls_device_fallback.c13
-rw-r--r--net/tls/tls_main.c30
-rw-r--r--net/tls/tls_proc.c3
-rw-r--r--net/tls/tls_sw.c42
-rw-r--r--net/unix/af_unix.c3
-rw-r--r--net/unix/scm.c1
-rw-r--r--net/vmw_vsock/af_vsock.c58
-rw-r--r--net/vmw_vsock/hyperv_transport.c4
-rw-r--r--net/vmw_vsock/virtio_transport_common.c12
-rw-r--r--net/vmw_vsock/vsock_addr.c4
-rw-r--r--net/wimax/Kconfig40
-rw-r--r--net/wimax/Makefile13
-rw-r--r--net/wimax/debug-levels.h29
-rw-r--r--net/wimax/debugfs.c38
-rw-r--r--net/wimax/id-table.c130
-rw-r--r--net/wimax/op-msg.c391
-rw-r--r--net/wimax/op-reset.c108
-rw-r--r--net/wimax/op-rfkill.c431
-rw-r--r--net/wimax/op-state-get.c52
-rw-r--r--net/wimax/stack.c609
-rw-r--r--net/wimax/wimax-internal.h85
-rw-r--r--net/wireless/Kconfig1
-rw-r--r--net/wireless/chan.c141
-rw-r--r--net/wireless/core.c73
-rw-r--r--net/wireless/core.h16
-rw-r--r--net/wireless/lib80211.c2
-rw-r--r--net/wireless/mlme.c40
-rw-r--r--net/wireless/nl80211.c836
-rw-r--r--net/wireless/nl80211.h8
-rw-r--r--net/wireless/radiotap.c1
-rw-r--r--net/wireless/rdev-ops.h22
-rw-r--r--net/wireless/reg.c350
-rw-r--r--net/wireless/scan.c586
-rw-r--r--net/wireless/sme.c2
-rw-r--r--net/wireless/trace.h36
-rw-r--r--net/wireless/util.c121
-rw-r--r--net/wireless/wext-compat.c156
-rw-r--r--net/wireless/wext-core.c5
-rw-r--r--net/x25/af_x25.c53
-rw-r--r--net/x25/x25_dev.c13
-rw-r--r--net/x25/x25_link.c52
-rw-r--r--net/x25/x25_route.c10
-rw-r--r--net/xdp/xdp_umem.c222
-rw-r--r--net/xdp/xdp_umem.h8
-rw-r--r--net/xdp/xsk.c378
-rw-r--r--net/xdp/xsk.h13
-rw-r--r--net/xdp/xsk_buff_pool.c381
-rw-r--r--net/xdp/xsk_diag.c20
-rw-r--r--net/xdp/xsk_queue.h120
-rw-r--r--net/xdp/xskmap.c50
-rw-r--r--net/xfrm/Kconfig11
-rw-r--r--net/xfrm/Makefile1
-rw-r--r--net/xfrm/xfrm_compat.c626
-rw-r--r--net/xfrm/xfrm_input.c9
-rw-r--r--net/xfrm/xfrm_interface.c56
-rw-r--r--net/xfrm/xfrm_policy.c30
-rw-r--r--net/xfrm/xfrm_state.c93
-rw-r--r--net/xfrm/xfrm_user.c184
759 files changed, 42725 insertions, 22681 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index d4bcfd8f95bf..8b644113715e 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -51,12 +51,16 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg,
__be16 vlan_proto, u16 vlan_id)
{
struct net_device **array;
- unsigned int pidx, vidx;
+ unsigned int vidx;
unsigned int size;
+ int pidx;
ASSERT_RTNL();
pidx = vlan_proto_idx(vlan_proto);
+ if (pidx < 0)
+ return -EINVAL;
+
vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN;
array = vg->vlan_devices_arrays[pidx][vidx];
if (array != NULL)
@@ -280,8 +284,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
return 0;
out_free_newdev:
- if (new_dev->reg_state == NETREG_UNINITIALIZED)
- free_netdev(new_dev);
+ free_netdev(new_dev);
return err;
}
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index bb7ec1a3915d..953405362795 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -36,7 +36,7 @@ struct vlan_info {
struct rcu_head rcu;
};
-static inline unsigned int vlan_proto_idx(__be16 proto)
+static inline int vlan_proto_idx(__be16 proto)
{
switch (proto) {
case htons(ETH_P_8021Q):
@@ -44,8 +44,8 @@ static inline unsigned int vlan_proto_idx(__be16 proto)
case htons(ETH_P_8021AD):
return VLAN_PROTO_8021AD;
default:
- BUG();
- return 0;
+ WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto));
+ return -EINVAL;
}
}
@@ -64,17 +64,24 @@ static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
__be16 vlan_proto,
u16 vlan_id)
{
- return __vlan_group_get_device(vg, vlan_proto_idx(vlan_proto), vlan_id);
+ int pidx = vlan_proto_idx(vlan_proto);
+
+ if (pidx < 0)
+ return NULL;
+
+ return __vlan_group_get_device(vg, pidx, vlan_id);
}
static inline void vlan_group_set_device(struct vlan_group *vg,
__be16 vlan_proto, u16 vlan_id,
struct net_device *dev)
{
+ int pidx = vlan_proto_idx(vlan_proto);
struct net_device **array;
- if (!vg)
+
+ if (!vg || pidx < 0)
return;
- array = vg->vlan_devices_arrays[vlan_proto_idx(vlan_proto)]
+ array = vg->vlan_devices_arrays[pidx]
[vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
}
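
The vlan.c and vlan.h hunks above replace a BUG() on an unknown VLAN protocol with a recoverable error: vlan_proto_idx() now returns a negative value and every caller checks it before indexing the per-protocol array. The following is a minimal user-space sketch of that pattern, not the kernel code itself; the names mirror the patch but the program is only an illustration.

#include <arpa/inet.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define ETH_P_8021Q	0x8100
#define ETH_P_8021AD	0x88A8

enum { VLAN_PROTO_8021Q, VLAN_PROTO_8021AD };

/* Report an unknown protocol instead of crashing (was BUG() before). */
static int vlan_proto_idx(uint16_t proto_be)
{
	switch (ntohs(proto_be)) {
	case ETH_P_8021Q:
		return VLAN_PROTO_8021Q;
	case ETH_P_8021AD:
		return VLAN_PROTO_8021AD;
	default:
		fprintf(stderr, "invalid VLAN protocol: 0x%04x\n", ntohs(proto_be));
		return -EINVAL;
	}
}

int main(void)
{
	int pidx = vlan_proto_idx(htons(0x1234));

	if (pidx < 0)		/* every caller now checks the index */
		return 1;

	printf("protocol index %d\n", pidx);
	return 0;
}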
diff --git a/net/9p/client.c b/net/9p/client.c
index 09f1ec589b80..4f62f299da0c 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -412,8 +412,9 @@ static void p9_tag_cleanup(struct p9_client *c)
/**
* p9_client_cb - call back from transport to client
- * c: client state
- * req: request received
+ * @c: client state
+ * @req: request received
+ * @status: request status, one of REQ_STATUS_*
*
*/
void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
@@ -555,6 +556,7 @@ out_err:
* p9_check_zc_errors - check 9p packet for error return and process it
* @c: current client instance
* @req: request to parse and check for error conditions
+ * @uidata: external buffer containing error
* @in_hdrlen: Size of response protocol buffer.
*
* returns error code if one is discovered, otherwise returns 0
@@ -901,6 +903,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
fid->clnt = clnt;
fid->rdir = NULL;
fid->fid = 0;
+ refcount_set(&fid->count, 1);
idr_preload(GFP_KERNEL);
spin_lock_irq(&clnt->lock);
@@ -908,7 +911,6 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
GFP_NOWAIT);
spin_unlock_irq(&clnt->lock);
idr_preload_end();
-
if (!ret)
return fid;
@@ -1187,7 +1189,6 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n",
oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL);
-
req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid,
nwname, wnames);
if (IS_ERR(req)) {
@@ -1219,7 +1220,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
if (nwname)
memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid));
else
- fid->qid = oldfid->qid;
+ memmove(&fid->qid, &oldfid->qid, sizeof(struct p9_qid));
kfree(wqids);
return fid;
@@ -1272,6 +1273,7 @@ int p9_client_open(struct p9_fid *fid, int mode)
p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type,
(unsigned long long)qid.path, qid.version, iounit);
+ memmove(&fid->qid, &qid, sizeof(struct p9_qid));
fid->mode = mode;
fid->iounit = iounit;
@@ -1317,6 +1319,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32
(unsigned long long)qid->path,
qid->version, iounit);
+ memmove(&ofid->qid, qid, sizeof(struct p9_qid));
ofid->mode = mode;
ofid->iounit = iounit;
@@ -1362,6 +1365,7 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode,
(unsigned long long)qid.path,
qid.version, iounit);
+ memmove(&fid->qid, &qid, sizeof(struct p9_qid));
fid->mode = mode;
fid->iounit = iounit;
@@ -1458,12 +1462,14 @@ int p9_client_clunk(struct p9_fid *fid)
struct p9_req_t *req;
int retries = 0;
- if (!fid) {
- pr_warn("%s (%d): Trying to clunk with NULL fid\n",
+ if (!fid || IS_ERR(fid)) {
+ pr_warn("%s (%d): Trying to clunk with invalid fid\n",
__func__, task_pid_nr(current));
dump_stack();
return 0;
}
+ if (!refcount_dec_and_test(&fid->count))
+ return 0;
again:
p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid,
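
The clunk path above now takes a per-fid reference count: fids are created with a count of one and p9_client_clunk() only sends the TCLUNK request when the last reference is dropped. Below is a small self-contained C11 sketch of that refcount_dec_and_test() idiom, using <stdatomic.h> rather than the kernel's refcount API; it illustrates the pattern and is not the 9p client code.

#include <stdatomic.h>
#include <stdio.h>

struct fid {
	atomic_int count;
	int id;
};

static void fid_init(struct fid *f, int id)
{
	atomic_init(&f->count, 1);	/* mirrors refcount_set(&fid->count, 1) */
	f->id = id;
}

static void fid_get(struct fid *f)
{
	atomic_fetch_add(&f->count, 1);
}

static void fid_clunk(struct fid *f)
{
	/* Like refcount_dec_and_test(): only the final put does the work. */
	if (atomic_fetch_sub(&f->count, 1) != 1)
		return;

	printf("TCLUNK fid %d\n", f->id);
}

int main(void)
{
	struct fid f;

	fid_init(&f, 42);
	fid_get(&f);
	fid_clunk(&f);	/* 2 -> 1, nothing sent */
	fid_clunk(&f);	/* 1 -> 0, clunk is issued */
	return 0;
}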
diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c
index 3dff68f05fb9..6ea5ea548cd4 100644
--- a/net/9p/trans_common.c
+++ b/net/9p/trans_common.c
@@ -17,7 +17,9 @@
#include "trans_common.h"
/**
- * p9_release_pages - Release pages after the transaction.
+ * p9_release_pages - Release pages after the transaction.
+ * @pages: array of pages to be put
+ * @nr_pages: size of array
*/
void p9_release_pages(struct page **pages, int nr_pages)
{
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index c0762a302162..fa158397bb63 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -45,7 +45,7 @@ static struct p9_trans_module p9_fd_trans;
* @rfd: file descriptor for reading (trans=fd)
* @wfd: file descriptor for writing (trans=fd)
* @port: port to connect to (trans=tcp)
- *
+ * @privport: port is privileged
*/
struct p9_fd_opts {
@@ -95,6 +95,8 @@ struct p9_poll_wait {
* @err: error state
* @req_list: accounting for requests which have been sent
* @unsent_req_list: accounting for requests that haven't been sent
+ * @rreq: read request
+ * @wreq: write request
* @req: current request being processed (if any)
* @tmp_buf: temporary buffer to read in header
* @rc: temporary fcall for reading current frame
@@ -1023,7 +1025,7 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
csocket = NULL;
- if (addr == NULL)
+ if (!addr || !strlen(addr))
return -EINVAL;
if (strlen(addr) >= UNIX_PATH_MAX) {
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 2885ff9c76f0..af0a8a6cd3fd 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -99,6 +99,7 @@ struct p9_rdma_req;
/**
* struct p9_rdma_context - Keeps track of in-process WR
*
+ * @cqe: completion queue entry
* @busa: Bus address to unmap when the WR completes
* @req: Keeps track of requests (send)
* @rc: Keepts track of replies (receive)
@@ -115,6 +116,7 @@ struct p9_rdma_context {
/**
* struct p9_rdma_opts - Collection of mount options
* @port: port of connection
+ * @privport: Whether a privileged port may be used
* @sq_depth: The requested depth of the SQ. This really doesn't need
* to be any deeper than the number of threads used in the client
* @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index a3cd90a74012..93f2f8654882 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -50,7 +50,11 @@ static atomic_t vp_pinned = ATOMIC_INIT(0);
* @client: client instance
* @vdev: virtio dev associated with this channel
* @vq: virtio queue associated with this channel
+ * @ring_bufs_avail: flag to indicate there is some available in the ring buf
+ * @vc_wq: wait queue for waiting for thing to be added to ring buf
+ * @p9_max_pages: maximum number of pinned pages
* @sg: scatter gather list which is used to pack a request (protected?)
+ * @chan_list: linked list of channels
*
* We keep all per-channel information in a structure.
* This structure is allocated within the devices dev->mem space.
@@ -74,8 +78,8 @@ struct virtio_chan {
unsigned long p9_max_pages;
/* Scatterlist: can be too big for stack. */
struct scatterlist sg[VIRTQUEUE_NUM];
- /*
- * tag name to identify a mount null terminated
+ /**
+ * @tag: name to identify a mount null terminated
*/
char *tag;
@@ -204,6 +208,7 @@ static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req)
* this takes a list of pages.
* @sg: scatter/gather list to pack into
* @start: which segment of the sg_list to start at
+ * @limit: maximum number of pages in sg list.
* @pdata: a list of pages to add into sg.
* @nr_pages: number of pages to pack into the scatter/gather list
* @offs: amount of data in the beginning of first page _not_ to pack
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index bc8807d9281f..f4fea28e05da 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -451,13 +451,13 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev,
char str[16];
BUILD_BUG_ON(XEN_9PFS_NUM_RINGS > 9);
- sprintf(str, "ring-ref%u", i);
+ sprintf(str, "ring-ref%d", i);
ret = xenbus_printf(xbt, dev->nodename, str, "%d",
priv->rings[i].ref);
if (ret)
goto error_xenbus;
- sprintf(str, "event-channel-%u", i);
+ sprintf(str, "event-channel-%d", i);
ret = xenbus_printf(xbt, dev->nodename, str, "%u",
priv->rings[i].evtchn);
if (ret)
diff --git a/net/Kconfig b/net/Kconfig
index 3831206977a1..f4c32d982af6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -386,8 +386,6 @@ source "net/mac80211/Kconfig"
endif # WIRELESS
-source "net/wimax/Kconfig"
-
source "net/rfkill/Kconfig"
source "net/9p/Kconfig"
source "net/caif/Kconfig"
@@ -434,7 +432,6 @@ config NET_SOCK_MSG
config NET_DEVLINK
bool
default n
- imply NET_DROP_MONITOR
config PAGE_POOL
bool
diff --git a/net/Makefile b/net/Makefile
index 5744bf1997fd..d96b0aa8f39f 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -66,7 +66,6 @@ obj-$(CONFIG_MAC802154) += mac802154/
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_SYSCTL) += sysctl_net.o
endif
-obj-$(CONFIG_WIMAX) += wimax/
obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
obj-$(CONFIG_CEPH_LIB) += ceph/
obj-$(CONFIG_BATMAN_ADV) += batman-adv/
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 45f584171de7..be18af481d7d 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -44,15 +44,15 @@ int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME;
/* Lists of aarp entries */
/**
* struct aarp_entry - AARP entry
- * @last_sent - Last time we xmitted the aarp request
- * @packet_queue - Queue of frames wait for resolution
- * @status - Used for proxy AARP
- * expires_at - Entry expiry time
- * target_addr - DDP Address
- * dev - Device to use
- * hwaddr - Physical i/f address of target/router
- * xmit_count - When this hits 10 we give up
- * next - Next entry in chain
+ * @last_sent: Last time we xmitted the aarp request
+ * @packet_queue: Queue of frames wait for resolution
+ * @status: Used for proxy AARP
+ * @expires_at: Entry expiry time
+ * @target_addr: DDP Address
+ * @dev: Device to use
+ * @hwaddr: Physical i/f address of target/router
+ * @xmit_count: When this hits 10 we give up
+ * @next: Next entry in chain
*/
struct aarp_entry {
/* These first two are only used for unresolved entries */
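
The aarp.c hunk above (and the ddp.c and 9p ones elsewhere in this series) fixes kernel-doc syntax: each documented member or parameter must be written as "@name:" with a colon so that scripts/kernel-doc can parse it. A made-up example of the convention, not code from the tree:

/**
 * struct demo_entry - example cache entry
 * @expires_at: entry expiry time in jiffies
 * @xmit_count: number of transmissions so far
 */
struct demo_entry {
	unsigned long expires_at;
	int xmit_count;
};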
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 1d48708c5a2e..ca1a0d07a087 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1407,9 +1407,10 @@ drop:
/**
* atalk_rcv - Receive a packet (in skb) from device dev
- * @skb - packet received
- * @dev - network device where the packet comes from
- * @pt - packet type
+ * @skb: packet received
+ * @dev: network device where the packet comes from
+ * @pt: packet type
+ * @orig_dev: the original receive net device
*
* Receive a packet (in skb) from device dev. This has come from the SNAP
* decoder, and on entry skb->transport_header is the DDP header, skb->len
diff --git a/net/atm/lec.c b/net/atm/lec.c
index b570ef919c28..7226c784dbe0 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -954,9 +954,8 @@ static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct lec_state *state = seq->private;
- v = lec_get_idx(state, 1);
- *pos += !!PTR_ERR(v);
- return v;
+ ++*pos;
+ return lec_get_idx(state, 1);
}
static int lec_seq_show(struct seq_file *seq, void *v)
@@ -1070,7 +1069,7 @@ module_exit(lane_module_cleanup);
/*
* LANE2: 3.1.3, LE_RESOLVE.request
* Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs.
- * If sizeoftlvs == NULL the default TLVs associated with with this
+ * If sizeoftlvs == NULL the default TLVs associated with this
* lec will be used.
* If dst_mac == NULL, targetless LE_ARP will be sent
*/
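
The lec_seq_next() change above makes the ->next callback advance *pos unconditionally and only then fetch the next element, which is the usual seq_file contract. The snippet below is an ordinary C illustration of that iterator contract (advance the position exactly once per call, even when the walk ends), not the seq_file API itself:

#include <stddef.h>
#include <stdio.h>

static const char *table[] = { "entry0", "entry1", "entry2" };

static const void *demo_seq_next(const void *v, long *pos)
{
	(void)v;
	++*pos;				/* always advance, as in the fix */
	if (*pos >= (long)(sizeof(table) / sizeof(table[0])))
		return NULL;		/* end of the table */
	return table[*pos];
}

int main(void)
{
	long pos = 0;
	const void *v = table[0];

	while (v) {
		printf("%ld: %s\n", pos, (const char *)v);
		v = demo_seq_next(v, &pos);
	}
	return 0;
}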
diff --git a/net/atm/raw.c b/net/atm/raw.c
index b3ba44aab0ee..2b5f78a7ec3e 100644
--- a/net/atm/raw.c
+++ b/net/atm/raw.c
@@ -54,6 +54,8 @@ static int atm_send_aal0(struct atm_vcc *vcc, struct sk_buff *skb)
kfree_skb(skb);
return -EADDRNOTAVAIL;
}
+ if (vcc->dev->ops->send_bh)
+ return vcc->dev->ops->send_bh(vcc, skb);
return vcc->dev->ops->send(vcc, skb);
}
@@ -71,7 +73,10 @@ int atm_init_aal34(struct atm_vcc *vcc)
vcc->push = atm_push_raw;
vcc->pop = atm_pop_raw;
vcc->push_oam = NULL;
- vcc->send = vcc->dev->ops->send;
+ if (vcc->dev->ops->send_bh)
+ vcc->send = vcc->dev->ops->send_bh;
+ else
+ vcc->send = vcc->dev->ops->send;
return 0;
}
@@ -80,7 +85,10 @@ int atm_init_aal5(struct atm_vcc *vcc)
vcc->push = atm_push_raw;
vcc->pop = atm_pop_raw;
vcc->push_oam = NULL;
- vcc->send = vcc->dev->ops->send;
+ if (vcc->dev->ops->send_bh)
+ vcc->send = vcc->dev->ops->send_bh;
+ else
+ vcc->send = vcc->dev->ops->send;
return 0;
}
EXPORT_SYMBOL(atm_init_aal5);
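
The raw.c hunks above let an ATM device supply an alternative ->send_bh transmit hook (presumably usable from bottom-half context, going by the name): when it exists it is preferred, otherwise the code falls back to the ordinary ->send operation. Here is a small self-contained C sketch of that optional-callback fallback, with made-up ops and messages purely for illustration:

#include <stdio.h>

struct ops {
	int (*send)(const char *msg);
	int (*send_bh)(const char *msg);	/* optional, may be NULL */
};

static int plain_send(const char *msg)
{
	printf("send: %s\n", msg);
	return 0;
}

static int bh_send(const char *msg)
{
	printf("send_bh: %s\n", msg);
	return 0;
}

/* Pick the bottom-half variant when the device provides one. */
static int (*pick_send(const struct ops *ops))(const char *)
{
	return ops->send_bh ? ops->send_bh : ops->send;
}

int main(void)
{
	struct ops with_bh = { .send = plain_send, .send_bh = bh_send };
	struct ops without_bh = { .send = plain_send };

	pick_send(&with_bh)("hello");		/* uses send_bh */
	pick_send(&without_bh)("hello");	/* falls back to send */
	return 0;
}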
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index fbd0c5e7b299..5de06ab8ed75 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -52,7 +52,7 @@ static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg)
msg->type = as_okay;
}
/*
- * Should probably just turn around the old skb. But the, the buffer
+ * Should probably just turn around the old skb. But then, the buffer
* space accounting needs to follow the change too. Maybe later.
*/
while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL)))
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index c762758a4649..993afd5ff7bb 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -76,37 +76,14 @@ config BATMAN_ADV_MCAST
reduce the air overhead while improving the reliability of
multicast messages.
-config BATMAN_ADV_DEBUGFS
- bool "batman-adv debugfs entries"
- depends on BATMAN_ADV
- depends on DEBUG_FS
- help
- Enable this to export routing related debug tables via debugfs.
- The information for each soft-interface and used hard-interface can be
- found under batman_adv/
-
- If unsure, say N.
-
config BATMAN_ADV_DEBUG
bool "B.A.T.M.A.N. debugging"
depends on BATMAN_ADV
help
This is an option for use by developers; most people should
say N here. This enables compilation of support for
- outputting debugging information to the debugfs log or tracing
- buffer. The output is controlled via the batadv netdev specific
- log_level setting.
-
-config BATMAN_ADV_SYSFS
- bool "batman-adv sysfs entries"
- depends on BATMAN_ADV
- help
- Say Y here if you want to enable batman-adv device configuration and
- status interface through sysfs attributes. It is replaced by the
- batadv generic netlink family but still used by various userspace
- tools and scripts.
-
- If unsure, say Y.
+ outputting debugging information to the tracing buffer. The output is
+ controlled via the batadv netdev specific log_level setting.
config BATMAN_ADV_TRACING
bool "B.A.T.M.A.N. tracing support"
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index daa49af7ff40..8010c34b987c 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -11,14 +11,12 @@ batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o
batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o
batman-adv-y += bitarray.o
batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
-batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += debugfs.o
batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o
batman-adv-y += fragmentation.o
batman-adv-y += gateway_client.o
batman-adv-y += gateway_common.o
batman-adv-y += hard-interface.o
batman-adv-y += hash.o
-batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += icmp_socket.o
batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o
batman-adv-y += main.o
batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
@@ -28,7 +26,6 @@ batman-adv-y += originator.o
batman-adv-y += routing.o
batman-adv-y += send.o
batman-adv-y += soft-interface.o
-batman-adv-$(CONFIG_BATMAN_ADV_SYSFS) += sysfs.o
batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o
batman-adv-y += tp_meter.o
batman-adv-y += translation-table.o
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 382fbe51fd34..c5f404f6892f 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -11,7 +11,6 @@
#include <linux/moduleparam.h>
#include <linux/netlink.h>
#include <linux/printk.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/string.h>
@@ -34,7 +33,13 @@ void batadv_algo_init(void)
INIT_HLIST_HEAD(&batadv_algo_list);
}
-static struct batadv_algo_ops *batadv_algo_get(char *name)
+/**
+ * batadv_algo_get() - Search for algorithm with specific name
+ * @name: algorithm name to find
+ *
+ * Return: Pointer to batadv_algo_ops on success, NULL otherwise
+ */
+struct batadv_algo_ops *batadv_algo_get(const char *name)
{
struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp;
@@ -97,7 +102,7 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
*
* Return: 0 on success or negative error number in case of failure
*/
-int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
+int batadv_algo_select(struct batadv_priv *bat_priv, const char *name)
{
struct batadv_algo_ops *bat_algo_ops;
@@ -110,29 +115,6 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
return 0;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-/**
- * batadv_algo_seq_print_text() - Print the supported algorithms in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct batadv_algo_ops *bat_algo_ops;
-
- seq_puts(seq, "Available routing algorithms:\n");
-
- hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
- seq_printf(seq, " * %s\n", bat_algo_ops->name);
- }
-
- return 0;
-}
-#endif
-
static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
{
struct batadv_algo_ops *bat_algo_ops;
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 686a60bc9492..43b045ac8ac7 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -10,7 +10,6 @@
#include "main.h"
#include <linux/netlink.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/types.h>
@@ -18,9 +17,9 @@ extern char batadv_routing_algo[];
extern struct list_head batadv_hardif_list;
void batadv_algo_init(void);
+struct batadv_algo_ops *batadv_algo_get(const char *name);
int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops);
-int batadv_algo_select(struct batadv_priv *bat_priv, char *name);
-int batadv_algo_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_algo_select(struct batadv_priv *bat_priv, const char *name);
int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb);
#endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
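
The bat_algo.h hunk above exports batadv_algo_get() and makes the name parameters const, so callers can pass string literals and read-only parameter buffers without casts. A minimal generic illustration of that const-correctness change (names here are invented):

#include <stdio.h>
#include <string.h>

static int algo_select(const char *name)	/* was: char *name */
{
	return strcmp(name, "BATMAN_IV") == 0 ? 0 : -1;
}

int main(void)
{
	/* String literals are read-only; no cast is needed with const. */
	printf("%d\n", algo_select("BATMAN_IV"));
	return 0;
}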
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index a4faf5f904d9..168621c9a081 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -27,11 +27,11 @@
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/pkt_sched.h>
+#include <linux/prandom.h>
#include <linux/printk.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -1779,106 +1779,6 @@ free_skb:
return ret;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_iv_ogm_orig_print_neigh() - print neighbors for the originator table
- * @orig_node: the orig_node for which the neighbors are printed
- * @if_outgoing: outgoing interface for these entries
- * @seq: debugfs table seq_file struct
- *
- * Must be called while holding an rcu lock.
- */
-static void
-batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node,
- struct batadv_hard_iface *if_outgoing,
- struct seq_file *seq)
-{
- struct batadv_neigh_node *neigh_node;
- struct batadv_neigh_ifinfo *n_ifinfo;
-
- hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
- n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
- if (!n_ifinfo)
- continue;
-
- seq_printf(seq, " %pM (%3i)",
- neigh_node->addr,
- n_ifinfo->bat_iv.tq_avg);
-
- batadv_neigh_ifinfo_put(n_ifinfo);
- }
-}
-
-/**
- * batadv_iv_ogm_orig_print() - print the originator table
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: debugfs table seq_file struct
- * @if_outgoing: the outgoing interface for which this should be printed
- */
-static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv,
- struct seq_file *seq,
- struct batadv_hard_iface *if_outgoing)
-{
- struct batadv_neigh_node *neigh_node;
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- int last_seen_msecs, last_seen_secs;
- struct batadv_orig_node *orig_node;
- struct batadv_neigh_ifinfo *n_ifinfo;
- unsigned long last_seen_jiffies;
- struct hlist_head *head;
- int batman_count = 0;
- u32 i;
-
- seq_puts(seq,
- " Originator last-seen (#/255) Nexthop [outgoingIF]: Potential nexthops ...\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- neigh_node = batadv_orig_router_get(orig_node,
- if_outgoing);
- if (!neigh_node)
- continue;
-
- n_ifinfo = batadv_neigh_ifinfo_get(neigh_node,
- if_outgoing);
- if (!n_ifinfo)
- goto next;
-
- if (n_ifinfo->bat_iv.tq_avg == 0)
- goto next;
-
- last_seen_jiffies = jiffies - orig_node->last_seen;
- last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
- last_seen_secs = last_seen_msecs / 1000;
- last_seen_msecs = last_seen_msecs % 1000;
-
- seq_printf(seq, "%pM %4i.%03is (%3i) %pM [%10s]:",
- orig_node->orig, last_seen_secs,
- last_seen_msecs, n_ifinfo->bat_iv.tq_avg,
- neigh_node->addr,
- neigh_node->if_incoming->net_dev->name);
-
- batadv_iv_ogm_orig_print_neigh(orig_node, if_outgoing,
- seq);
- seq_putc(seq, '\n');
- batman_count++;
-
-next:
- batadv_neigh_node_put(neigh_node);
- if (n_ifinfo)
- batadv_neigh_ifinfo_put(n_ifinfo);
- }
- rcu_read_unlock();
- }
-
- if (batman_count == 0)
- seq_puts(seq, "No batman nodes in range ...\n");
-}
-#endif
-
/**
* batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a
* given outgoing interface.
@@ -2108,59 +2008,6 @@ batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
cb->args[2] = sub;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_iv_hardif_neigh_print() - print a single hop neighbour node
- * @seq: neighbour table seq_file struct
- * @hardif_neigh: hardif neighbour information
- */
-static void
-batadv_iv_hardif_neigh_print(struct seq_file *seq,
- struct batadv_hardif_neigh_node *hardif_neigh)
-{
- int last_secs, last_msecs;
-
- last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000;
- last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000;
-
- seq_printf(seq, " %10s %pM %4i.%03is\n",
- hardif_neigh->if_incoming->net_dev->name,
- hardif_neigh->addr, last_secs, last_msecs);
-}
-
-/**
- * batadv_iv_ogm_neigh_print() - print the single hop neighbour list
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: neighbour table seq_file struct
- */
-static void batadv_iv_neigh_print(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_hardif_neigh_node *hardif_neigh;
- struct batadv_hard_iface *hard_iface;
- int batman_count = 0;
-
- seq_puts(seq, " IF Neighbor last-seen\n");
-
- rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface != net_dev)
- continue;
-
- hlist_for_each_entry_rcu(hardif_neigh,
- &hard_iface->neigh_list, list) {
- batadv_iv_hardif_neigh_print(seq, hardif_neigh);
- batman_count++;
- }
- }
- rcu_read_unlock();
-
- if (batman_count == 0)
- seq_puts(seq, "No batman nodes in range ...\n");
-}
-#endif
-
/**
* batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors
* @neigh1: the first neighbor object of the comparison
@@ -2556,72 +2403,6 @@ out:
return ret;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/* fails if orig_node has no router */
-static int batadv_iv_gw_write_buffer_text(struct batadv_priv *bat_priv,
- struct seq_file *seq,
- const struct batadv_gw_node *gw_node)
-{
- struct batadv_gw_node *curr_gw;
- struct batadv_neigh_node *router;
- struct batadv_neigh_ifinfo *router_ifinfo = NULL;
- int ret = -1;
-
- router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
- if (!router)
- goto out;
-
- router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
- if (!router_ifinfo)
- goto out;
-
- curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
-
- seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n",
- (curr_gw == gw_node ? "=>" : " "),
- gw_node->orig_node->orig,
- router_ifinfo->bat_iv.tq_avg, router->addr,
- router->if_incoming->net_dev->name,
- gw_node->bandwidth_down / 10,
- gw_node->bandwidth_down % 10,
- gw_node->bandwidth_up / 10,
- gw_node->bandwidth_up % 10);
- ret = seq_has_overflowed(seq) ? -1 : 0;
-
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
-out:
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
- return ret;
-}
-
-static void batadv_iv_gw_print(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- struct batadv_gw_node *gw_node;
- int gw_count = 0;
-
- seq_puts(seq,
- " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
- /* fails if orig_node has no router */
- if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
- continue;
-
- gw_count++;
- }
- rcu_read_unlock();
-
- if (gw_count == 0)
- seq_puts(seq, "No gateways in range ...\n");
-}
-#endif
-
/**
* batadv_iv_gw_dump_entry() - Dump a gateway into a message
* @msg: Netlink message to dump into
@@ -2746,24 +2527,15 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
.neigh = {
.cmp = batadv_iv_ogm_neigh_cmp,
.is_similar_or_better = batadv_iv_ogm_neigh_is_sob,
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_iv_neigh_print,
-#endif
.dump = batadv_iv_ogm_neigh_dump,
},
.orig = {
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_iv_ogm_orig_print,
-#endif
.dump = batadv_iv_ogm_orig_dump,
},
.gw = {
.init_sel_class = batadv_iv_init_sel_class,
.get_best_gw_node = batadv_iv_gw_get_best_gw_node,
.is_eligible = batadv_iv_gw_is_eligible,
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_iv_gw_print,
-#endif
.dump = batadv_iv_gw_dump,
},
};
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 0ecaf1bb0068..e4455babe4c2 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -13,14 +13,13 @@
#include <linux/if_ether.h>
#include <linux/init.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
@@ -119,92 +118,6 @@ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
batadv_v_elp_throughput_metric_update);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_v_orig_print_neigh() - print neighbors for the originator table
- * @orig_node: the orig_node for which the neighbors are printed
- * @if_outgoing: outgoing interface for these entries
- * @seq: debugfs table seq_file struct
- *
- * Must be called while holding an rcu lock.
- */
-static void
-batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node,
- struct batadv_hard_iface *if_outgoing,
- struct seq_file *seq)
-{
- struct batadv_neigh_node *neigh_node;
- struct batadv_neigh_ifinfo *n_ifinfo;
-
- hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
- n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
- if (!n_ifinfo)
- continue;
-
- seq_printf(seq, " %pM (%9u.%1u)",
- neigh_node->addr,
- n_ifinfo->bat_v.throughput / 10,
- n_ifinfo->bat_v.throughput % 10);
-
- batadv_neigh_ifinfo_put(n_ifinfo);
- }
-}
-
-/**
- * batadv_v_hardif_neigh_print() - print a single ELP neighbour node
- * @seq: neighbour table seq_file struct
- * @hardif_neigh: hardif neighbour information
- */
-static void
-batadv_v_hardif_neigh_print(struct seq_file *seq,
- struct batadv_hardif_neigh_node *hardif_neigh)
-{
- int last_secs, last_msecs;
- u32 throughput;
-
- last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000;
- last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000;
- throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
-
- seq_printf(seq, "%pM %4i.%03is (%9u.%1u) [%10s]\n",
- hardif_neigh->addr, last_secs, last_msecs, throughput / 10,
- throughput % 10, hardif_neigh->if_incoming->net_dev->name);
-}
-
-/**
- * batadv_v_neigh_print() - print the single hop neighbour list
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: neighbour table seq_file struct
- */
-static void batadv_v_neigh_print(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_hardif_neigh_node *hardif_neigh;
- struct batadv_hard_iface *hard_iface;
- int batman_count = 0;
-
- seq_puts(seq,
- " Neighbor last-seen ( throughput) [ IF]\n");
-
- rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface != net_dev)
- continue;
-
- hlist_for_each_entry_rcu(hardif_neigh,
- &hard_iface->neigh_list, list) {
- batadv_v_hardif_neigh_print(seq, hardif_neigh);
- batman_count++;
- }
- }
- rcu_read_unlock();
-
- if (batman_count == 0)
- seq_puts(seq, "No batman nodes in range ...\n");
-}
-#endif
-
/**
* batadv_v_neigh_dump_neigh() - Dump a neighbour into a message
* @msg: Netlink message to dump into
@@ -337,75 +250,6 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
cb->args[1] = idx;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_v_orig_print() - print the originator table
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: debugfs table seq_file struct
- * @if_outgoing: the outgoing interface for which this should be printed
- */
-static void batadv_v_orig_print(struct batadv_priv *bat_priv,
- struct seq_file *seq,
- struct batadv_hard_iface *if_outgoing)
-{
- struct batadv_neigh_node *neigh_node;
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- int last_seen_msecs, last_seen_secs;
- struct batadv_orig_node *orig_node;
- struct batadv_neigh_ifinfo *n_ifinfo;
- unsigned long last_seen_jiffies;
- struct hlist_head *head;
- int batman_count = 0;
- u32 i;
-
- seq_puts(seq,
- " Originator last-seen ( throughput) Nexthop [outgoingIF]: Potential nexthops ...\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- neigh_node = batadv_orig_router_get(orig_node,
- if_outgoing);
- if (!neigh_node)
- continue;
-
- n_ifinfo = batadv_neigh_ifinfo_get(neigh_node,
- if_outgoing);
- if (!n_ifinfo)
- goto next;
-
- last_seen_jiffies = jiffies - orig_node->last_seen;
- last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
- last_seen_secs = last_seen_msecs / 1000;
- last_seen_msecs = last_seen_msecs % 1000;
-
- seq_printf(seq, "%pM %4i.%03is (%9u.%1u) %pM [%10s]:",
- orig_node->orig, last_seen_secs,
- last_seen_msecs,
- n_ifinfo->bat_v.throughput / 10,
- n_ifinfo->bat_v.throughput % 10,
- neigh_node->addr,
- neigh_node->if_incoming->net_dev->name);
-
- batadv_v_orig_print_neigh(orig_node, if_outgoing, seq);
- seq_putc(seq, '\n');
- batman_count++;
-
-next:
- batadv_neigh_node_put(neigh_node);
- if (n_ifinfo)
- batadv_neigh_ifinfo_put(n_ifinfo);
- }
- rcu_read_unlock();
- }
-
- if (batman_count == 0)
- seq_puts(seq, "No batman nodes in range ...\n");
-}
-#endif
-
/**
* batadv_v_orig_dump_subentry() - Dump an originator subentry into a message
* @msg: Netlink message to dump into
@@ -685,13 +529,6 @@ static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv,
return count;
}
-static ssize_t batadv_v_show_sel_class(struct batadv_priv *bat_priv, char *buff)
-{
- u32 class = atomic_read(&bat_priv->gw.sel_class);
-
- return sprintf(buff, "%u.%u MBit\n", class / 10, class % 10);
-}
-
/**
* batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW
* @gw_node: the GW to retrieve the metric for
@@ -829,78 +666,6 @@ out:
return ret;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/* fails if orig_node has no router */
-static int batadv_v_gw_write_buffer_text(struct batadv_priv *bat_priv,
- struct seq_file *seq,
- const struct batadv_gw_node *gw_node)
-{
- struct batadv_gw_node *curr_gw;
- struct batadv_neigh_node *router;
- struct batadv_neigh_ifinfo *router_ifinfo = NULL;
- int ret = -1;
-
- router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
- if (!router)
- goto out;
-
- router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
- if (!router_ifinfo)
- goto out;
-
- curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
-
- seq_printf(seq, "%s %pM (%9u.%1u) %pM [%10s]: %u.%u/%u.%u MBit\n",
- (curr_gw == gw_node ? "=>" : " "),
- gw_node->orig_node->orig,
- router_ifinfo->bat_v.throughput / 10,
- router_ifinfo->bat_v.throughput % 10, router->addr,
- router->if_incoming->net_dev->name,
- gw_node->bandwidth_down / 10,
- gw_node->bandwidth_down % 10,
- gw_node->bandwidth_up / 10,
- gw_node->bandwidth_up % 10);
- ret = seq_has_overflowed(seq) ? -1 : 0;
-
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
-out:
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
- return ret;
-}
-
-/**
- * batadv_v_gw_print() - print the gateway list
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: gateway table seq_file struct
- */
-static void batadv_v_gw_print(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- struct batadv_gw_node *gw_node;
- int gw_count = 0;
-
- seq_puts(seq,
- " Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
- /* fails if orig_node has no router */
- if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
- continue;
-
- gw_count++;
- }
- rcu_read_unlock();
-
- if (gw_count == 0)
- seq_puts(seq, "No gateways in range ...\n");
-}
-#endif
-
/**
* batadv_v_gw_dump_entry() - Dump a gateway into a message
* @msg: Netlink message to dump into
@@ -1046,26 +811,16 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
.hardif_init = batadv_v_hardif_neigh_init,
.cmp = batadv_v_neigh_cmp,
.is_similar_or_better = batadv_v_neigh_is_sob,
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_v_neigh_print,
-#endif
.dump = batadv_v_neigh_dump,
},
.orig = {
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_v_orig_print,
-#endif
.dump = batadv_v_orig_dump,
},
.gw = {
.init_sel_class = batadv_v_init_sel_class,
.store_sel_class = batadv_v_store_sel_class,
- .show_sel_class = batadv_v_show_sel_class,
.get_best_gw_node = batadv_v_gw_get_best_gw_node,
.is_eligible = batadv_v_gw_is_eligible,
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_v_gw_print,
-#endif
.dump = batadv_v_gw_dump,
},
};
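Note on the helpers removed from bat_v.c above: the debugfs printers and the show_sel_class callback all rely on the same fixed-point convention. Throughput, advertised bandwidth and the GW selection class are kept in steps of 0.1 Mbit/s, so printing value / 10 and value % 10 yields one decimal place, exactly as the removed sprintf("%u.%u MBit", ...) did. A tiny standalone illustration (plain C, not part of the patch; the unit interpretation is inferred from the removed code):

/* Illustration of the fixed-point formatting used by the removed
 * debugfs printers: values are stored in 0.1 Mbit/s (100 kbit/s) units.
 */
#include <stdio.h>

int main(void)
{
	unsigned int sel_class = 25;	/* 25 * 0.1 Mbit/s */

	/* prints "2.5 MBit", matching the removed batadv_v_show_sel_class() */
	printf("%u.%u MBit\n", sel_class / 10, sel_class % 10);
	return 0;
}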
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index d35aca0e969a..0512ea6cd818 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -18,8 +18,10 @@
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/kref.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/nl80211.h>
+#include <linux/prandom.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 717fe657561d..798d659855d0 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -18,8 +18,10 @@
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
+#include <linux/minmax.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
+#include <linux/prandom.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index c350ab63cd54..d2de12e527ba 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -28,7 +28,6 @@
#include <linux/preempt.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -1863,7 +1862,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb,
ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work);
- /* backbone_gw is unreferenced in the report work function function
+ /* backbone_gw is unreferenced in the report work function
* if queue_work() call was successful
*/
if (!ret)
@@ -2115,69 +2114,6 @@ out:
return ret;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_bla_claim_table_seq_print_text() - print the claim table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->bla.claim_hash;
- struct batadv_bla_backbone_gw *backbone_gw;
- struct batadv_bla_claim *claim;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- u16 backbone_crc;
- u32 i;
- bool is_own;
- u8 *primary_addr;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- primary_addr = primary_if->net_dev->dev_addr;
- seq_printf(seq,
- "Claims announced for the mesh %s (orig %pM, group id %#.4x)\n",
- net_dev->name, primary_addr,
- ntohs(bat_priv->bla.claim_dest.group));
- seq_puts(seq,
- " Client VID Originator [o] (CRC )\n");
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(claim, head, hash_entry) {
- backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
-
- is_own = batadv_compare_eth(backbone_gw->orig,
- primary_addr);
-
- spin_lock_bh(&backbone_gw->crc_lock);
- backbone_crc = backbone_gw->crc;
- spin_unlock_bh(&backbone_gw->crc_lock);
- seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n",
- claim->addr, batadv_print_vid(claim->vid),
- backbone_gw->orig,
- (is_own ? 'x' : ' '),
- backbone_crc);
-
- batadv_backbone_gw_put(backbone_gw);
- }
- rcu_read_unlock();
- }
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
* batadv_bla_claim_dump_entry() - dump one entry of the claim table
* to a netlink socket
@@ -2348,72 +2284,6 @@ out:
return ret;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_bla_backbone_table_seq_print_text() - print the backbone table in a
- * seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
- struct batadv_bla_backbone_gw *backbone_gw;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- int secs, msecs;
- u16 backbone_crc;
- u32 i;
- bool is_own;
- u8 *primary_addr;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- primary_addr = primary_if->net_dev->dev_addr;
- seq_printf(seq,
- "Backbones announced for the mesh %s (orig %pM, group id %#.4x)\n",
- net_dev->name, primary_addr,
- ntohs(bat_priv->bla.claim_dest.group));
- seq_puts(seq, " Originator VID last seen (CRC )\n");
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
- msecs = jiffies_to_msecs(jiffies -
- backbone_gw->lasttime);
- secs = msecs / 1000;
- msecs = msecs % 1000;
-
- is_own = batadv_compare_eth(backbone_gw->orig,
- primary_addr);
- if (is_own)
- continue;
-
- spin_lock_bh(&backbone_gw->crc_lock);
- backbone_crc = backbone_gw->crc;
- spin_unlock_bh(&backbone_gw->crc_lock);
-
- seq_printf(seq, " * %pM on %5d %4i.%03is (%#.4x)\n",
- backbone_gw->orig,
- batadv_print_vid(backbone_gw->vid), secs,
- msecs, backbone_crc);
- }
- rcu_read_unlock();
- }
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
* batadv_bla_backbone_dump_entry() - dump one entry of the backbone table to a
* netlink socket
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index a81c41b636f9..7dc6d3571925 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -12,7 +12,6 @@
#include <linux/compiler.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/types.h>
@@ -41,10 +40,7 @@ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
int hdr_size);
-int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset);
int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
-int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
- void *offset);
int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb);
bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
unsigned short vid);
@@ -84,18 +80,6 @@ static inline bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
return false;
}
-static inline int batadv_bla_claim_table_seq_print_text(struct seq_file *seq,
- void *offset)
-{
- return 0;
-}
-
-static inline int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
- void *offset)
-{
- return 0;
-}
-
static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv,
u8 *orig, unsigned short vid)
{
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
deleted file mode 100644
index 452856c27d20..000000000000
--- a/net/batman-adv/debugfs.c
+++ /dev/null
@@ -1,442 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- */
-
-#include "debugfs.h"
-#include "main.h"
-
-#include <asm/current.h>
-#include <linux/dcache.h>
-#include <linux/debugfs.h>
-#include <linux/errno.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/netdevice.h>
-#include <linux/printk.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/stat.h>
-#include <linux/stddef.h>
-#include <linux/stringify.h>
-#include <linux/sysfs.h>
-#include <net/net_namespace.h>
-
-#include "bat_algo.h"
-#include "bridge_loop_avoidance.h"
-#include "distributed-arp-table.h"
-#include "gateway_client.h"
-#include "icmp_socket.h"
-#include "log.h"
-#include "multicast.h"
-#include "network-coding.h"
-#include "originator.h"
-#include "translation-table.h"
-
-static struct dentry *batadv_debugfs;
-
-/**
- * batadv_debugfs_deprecated() - Log use of deprecated batadv debugfs access
- * @file: file which was accessed
- * @alt: explanation what can be used as alternative
- */
-void batadv_debugfs_deprecated(struct file *file, const char *alt)
-{
- struct dentry *dentry = file_dentry(file);
- const char *name = dentry->d_name.name;
-
- pr_warn_ratelimited(DEPRECATED "%s (pid %d) Use of debugfs file \"%s\".\n%s",
- current->comm, task_pid_nr(current), name, alt);
-}
-
-static int batadv_algorithms_open(struct inode *inode, struct file *file)
-{
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_ROUTING_ALGOS instead\n");
- return single_open(file, batadv_algo_seq_print_text, NULL);
-}
-
-static int neighbors_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_NEIGHBORS instead\n");
- return single_open(file, batadv_hardif_neigh_seq_print_text, net_dev);
-}
-
-static int batadv_originators_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_ORIGINATORS instead\n");
- return single_open(file, batadv_orig_seq_print_text, net_dev);
-}
-
-/**
- * batadv_originators_hardif_open() - handles debugfs output for the originator
- * table of an hard interface
- * @inode: inode pointer to debugfs file
- * @file: pointer to the seq_file
- *
- * Return: 0 on success or negative error number in case of failure
- */
-static int batadv_originators_hardif_open(struct inode *inode,
- struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_HARDIFS instead\n");
- return single_open(file, batadv_orig_hardif_seq_print_text, net_dev);
-}
-
-static int batadv_gateways_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_GATEWAYS instead\n");
- return single_open(file, batadv_gw_client_seq_print_text, net_dev);
-}
-
-static int batadv_transtable_global_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_TRANSTABLE_GLOBAL instead\n");
- return single_open(file, batadv_tt_global_seq_print_text, net_dev);
-}
-
-#ifdef CONFIG_BATMAN_ADV_BLA
-static int batadv_bla_claim_table_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_BLA_CLAIM instead\n");
- return single_open(file, batadv_bla_claim_table_seq_print_text,
- net_dev);
-}
-
-static int batadv_bla_backbone_table_open(struct inode *inode,
- struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_BLA_BACKBONE instead\n");
- return single_open(file, batadv_bla_backbone_table_seq_print_text,
- net_dev);
-}
-
-#endif
-
-#ifdef CONFIG_BATMAN_ADV_DAT
-/**
- * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache
- * @inode: inode which was opened
- * @file: file handle to be initialized
- *
- * Return: 0 on success or negative error number in case of failure
- */
-static int batadv_dat_cache_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_DAT_CACHE instead\n");
- return single_open(file, batadv_dat_cache_seq_print_text, net_dev);
-}
-#endif
-
-static int batadv_transtable_local_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_TRANSTABLE_LOCAL instead\n");
- return single_open(file, batadv_tt_local_seq_print_text, net_dev);
-}
-
-struct batadv_debuginfo {
- struct attribute attr;
- const struct file_operations fops;
-};
-
-#ifdef CONFIG_BATMAN_ADV_NC
-static int batadv_nc_nodes_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file, "");
- return single_open(file, batadv_nc_nodes_seq_print_text, net_dev);
-}
-#endif
-
-#ifdef CONFIG_BATMAN_ADV_MCAST
-/**
- * batadv_mcast_flags_open() - prepare file handler for reads from mcast_flags
- * @inode: inode which was opened
- * @file: file handle to be initialized
- *
- * Return: 0 on success or negative error number in case of failure
- */
-static int batadv_mcast_flags_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- batadv_debugfs_deprecated(file,
- "Use genl command BATADV_CMD_GET_MCAST_FLAGS instead\n");
- return single_open(file, batadv_mcast_flags_seq_print_text, net_dev);
-}
-#endif
-
-#define BATADV_DEBUGINFO(_name, _mode, _open) \
-struct batadv_debuginfo batadv_debuginfo_##_name = { \
- .attr = { \
- .name = __stringify(_name), \
- .mode = _mode, \
- }, \
- .fops = { \
- .owner = THIS_MODULE, \
- .open = _open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
- }, \
-}
-
-/* the following attributes are general and therefore they will be directly
- * placed in the BATADV_DEBUGFS_SUBDIR subdirectory of debugfs
- */
-static BATADV_DEBUGINFO(routing_algos, 0444, batadv_algorithms_open);
-
-static struct batadv_debuginfo *batadv_general_debuginfos[] = {
- &batadv_debuginfo_routing_algos,
- NULL,
-};
-
-/* The following attributes are per soft interface */
-static BATADV_DEBUGINFO(neighbors, 0444, neighbors_open);
-static BATADV_DEBUGINFO(originators, 0444, batadv_originators_open);
-static BATADV_DEBUGINFO(gateways, 0444, batadv_gateways_open);
-static BATADV_DEBUGINFO(transtable_global, 0444, batadv_transtable_global_open);
-#ifdef CONFIG_BATMAN_ADV_BLA
-static BATADV_DEBUGINFO(bla_claim_table, 0444, batadv_bla_claim_table_open);
-static BATADV_DEBUGINFO(bla_backbone_table, 0444,
- batadv_bla_backbone_table_open);
-#endif
-#ifdef CONFIG_BATMAN_ADV_DAT
-static BATADV_DEBUGINFO(dat_cache, 0444, batadv_dat_cache_open);
-#endif
-static BATADV_DEBUGINFO(transtable_local, 0444, batadv_transtable_local_open);
-#ifdef CONFIG_BATMAN_ADV_NC
-static BATADV_DEBUGINFO(nc_nodes, 0444, batadv_nc_nodes_open);
-#endif
-#ifdef CONFIG_BATMAN_ADV_MCAST
-static BATADV_DEBUGINFO(mcast_flags, 0444, batadv_mcast_flags_open);
-#endif
-
-static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
- &batadv_debuginfo_neighbors,
- &batadv_debuginfo_originators,
- &batadv_debuginfo_gateways,
- &batadv_debuginfo_transtable_global,
-#ifdef CONFIG_BATMAN_ADV_BLA
- &batadv_debuginfo_bla_claim_table,
- &batadv_debuginfo_bla_backbone_table,
-#endif
-#ifdef CONFIG_BATMAN_ADV_DAT
- &batadv_debuginfo_dat_cache,
-#endif
- &batadv_debuginfo_transtable_local,
-#ifdef CONFIG_BATMAN_ADV_NC
- &batadv_debuginfo_nc_nodes,
-#endif
-#ifdef CONFIG_BATMAN_ADV_MCAST
- &batadv_debuginfo_mcast_flags,
-#endif
- NULL,
-};
-
-#define BATADV_HARDIF_DEBUGINFO(_name, _mode, _open) \
-struct batadv_debuginfo batadv_hardif_debuginfo_##_name = { \
- .attr = { \
- .name = __stringify(_name), \
- .mode = _mode, \
- }, \
- .fops = { \
- .owner = THIS_MODULE, \
- .open = _open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
- }, \
-}
-
-static BATADV_HARDIF_DEBUGINFO(originators, 0444,
- batadv_originators_hardif_open);
-
-static struct batadv_debuginfo *batadv_hardif_debuginfos[] = {
- &batadv_hardif_debuginfo_originators,
- NULL,
-};
-
-/**
- * batadv_debugfs_init() - Initialize soft interface independent debugfs entries
- */
-void batadv_debugfs_init(void)
-{
- struct batadv_debuginfo **bat_debug;
-
- batadv_debugfs = debugfs_create_dir(BATADV_DEBUGFS_SUBDIR, NULL);
-
- for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug)
- debugfs_create_file(((*bat_debug)->attr).name,
- S_IFREG | ((*bat_debug)->attr).mode,
- batadv_debugfs, NULL, &(*bat_debug)->fops);
-}
-
-/**
- * batadv_debugfs_destroy() - Remove all debugfs entries
- */
-void batadv_debugfs_destroy(void)
-{
- debugfs_remove_recursive(batadv_debugfs);
- batadv_debugfs = NULL;
-}
-
-/**
- * batadv_debugfs_add_hardif() - creates the base directory for a hard interface
- * in debugfs.
- * @hard_iface: hard interface which should be added.
- */
-void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
-{
- struct net *net = dev_net(hard_iface->net_dev);
- struct batadv_debuginfo **bat_debug;
-
- if (net != &init_net)
- return;
-
- hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name,
- batadv_debugfs);
-
- for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug)
- debugfs_create_file(((*bat_debug)->attr).name,
- S_IFREG | ((*bat_debug)->attr).mode,
- hard_iface->debug_dir, hard_iface->net_dev,
- &(*bat_debug)->fops);
-}
-
-/**
- * batadv_debugfs_rename_hardif() - Fix debugfs path for renamed hardif
- * @hard_iface: hard interface which was renamed
- */
-void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
-{
- const char *name = hard_iface->net_dev->name;
- struct dentry *dir;
-
- dir = hard_iface->debug_dir;
- if (!dir)
- return;
-
- debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
-}
-
-/**
- * batadv_debugfs_del_hardif() - delete the base directory for a hard interface
- * in debugfs.
- * @hard_iface: hard interface which is deleted.
- */
-void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
-{
- struct net *net = dev_net(hard_iface->net_dev);
-
- if (net != &init_net)
- return;
-
- if (batadv_debugfs) {
- debugfs_remove_recursive(hard_iface->debug_dir);
- hard_iface->debug_dir = NULL;
- }
-}
-
-/**
- * batadv_debugfs_add_meshif() - Initialize interface dependent debugfs entries
- * @dev: netdev struct of the soft interface
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_debugfs_add_meshif(struct net_device *dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_debuginfo **bat_debug;
- struct net *net = dev_net(dev);
-
- if (net != &init_net)
- return 0;
-
- bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs);
-
- batadv_socket_setup(bat_priv);
-
- if (batadv_debug_log_setup(bat_priv) < 0)
- goto rem_attr;
-
- for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug)
- debugfs_create_file(((*bat_debug)->attr).name,
- S_IFREG | ((*bat_debug)->attr).mode,
- bat_priv->debug_dir, dev,
- &(*bat_debug)->fops);
-
- batadv_nc_init_debugfs(bat_priv);
-
- return 0;
-rem_attr:
- debugfs_remove_recursive(bat_priv->debug_dir);
- bat_priv->debug_dir = NULL;
- return -ENOMEM;
-}
-
-/**
- * batadv_debugfs_rename_meshif() - Fix debugfs path for renamed softif
- * @dev: net_device which was renamed
- */
-void batadv_debugfs_rename_meshif(struct net_device *dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(dev);
- const char *name = dev->name;
- struct dentry *dir;
-
- dir = bat_priv->debug_dir;
- if (!dir)
- return;
-
- debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
-}
-
-/**
- * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries
- * @dev: netdev struct of the soft interface
- */
-void batadv_debugfs_del_meshif(struct net_device *dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct net *net = dev_net(dev);
-
- if (net != &init_net)
- return;
-
- batadv_debug_log_cleanup(bat_priv);
-
- if (batadv_debugfs) {
- debugfs_remove_recursive(bat_priv->debug_dir);
- bat_priv->debug_dir = NULL;
- }
-}
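The debugfs tables deleted above were already mirrored by generic netlink dumps; the deprecation strings in the removed code ("Use genl command BATADV_CMD_GET_ORIGINATORS instead", and so on) name the replacement. As a rough sketch of that replacement path, a userspace dump over libnl-genl could look like the code below. This is not part of the patch: the family name "batadv", the BATADV_ATTR_MESH_IFINDEX attribute and the includes are assumptions taken from the batman-adv uapi header, and error handling is omitted.

/* Hypothetical userspace sketch: dump the originator table via generic
 * netlink instead of the removed debugfs "originators" file.
 * Assumptions: genl family "batadv" and BATADV_ATTR_MESH_IFINDEX come
 * from <linux/batman_adv.h>; error handling is left out for brevity.
 */
#include <net/if.h>
#include <linux/batman_adv.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

static int dump_cb(struct nl_msg *msg, void *arg)
{
	/* each originator arrives as one BATADV_CMD_GET_ORIGINATORS reply */
	return NL_OK;
}

int dump_originators(const char *meshif)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, "batadv");
	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, dump_cb, NULL);

	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, NLM_F_DUMP,
		    BATADV_CMD_GET_ORIGINATORS, 1);
	nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, if_nametoindex(meshif));

	nl_send_auto(sk, msg);
	nl_recvmsgs_default(sk);

	nlmsg_free(msg);
	nl_socket_free(sk);
	return 0;
}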
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
deleted file mode 100644
index 7e2e8f586f42..000000000000
--- a/net/batman-adv/debugfs.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- */
-
-#ifndef _NET_BATMAN_ADV_DEBUGFS_H_
-#define _NET_BATMAN_ADV_DEBUGFS_H_
-
-#include "main.h"
-
-#include <linux/fs.h>
-#include <linux/netdevice.h>
-
-#define BATADV_DEBUGFS_SUBDIR "batman_adv"
-
-#if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS)
-
-void batadv_debugfs_deprecated(struct file *file, const char *alt);
-void batadv_debugfs_init(void);
-void batadv_debugfs_destroy(void);
-int batadv_debugfs_add_meshif(struct net_device *dev);
-void batadv_debugfs_rename_meshif(struct net_device *dev);
-void batadv_debugfs_del_meshif(struct net_device *dev);
-void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
-void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface);
-void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
-
-#else
-
-static inline void batadv_debugfs_deprecated(struct file *file, const char *alt)
-{
-}
-
-static inline void batadv_debugfs_init(void)
-{
-}
-
-static inline void batadv_debugfs_destroy(void)
-{
-}
-
-static inline int batadv_debugfs_add_meshif(struct net_device *dev)
-{
- return 0;
-}
-
-static inline void batadv_debugfs_rename_meshif(struct net_device *dev)
-{
-}
-
-static inline void batadv_debugfs_del_meshif(struct net_device *dev)
-{
-}
-
-static inline
-void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
-{
-}
-
-static inline
-void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
-{
-}
-
-static inline
-void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
-{
-}
-
-#endif
-
-#endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 0e6e53e9b5f3..fd7ba6bbdf85 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -26,7 +26,6 @@
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -842,60 +841,6 @@ void batadv_dat_free(struct batadv_priv *bat_priv)
batadv_dat_hash_free(bat_priv);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_dat_cache_seq_print_text() - print the local DAT hash table
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->dat.hash;
- struct batadv_dat_entry *dat_entry;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- unsigned long last_seen_jiffies;
- int last_seen_msecs, last_seen_secs, last_seen_mins;
- u32 i;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- seq_printf(seq, "Distributed ARP Table (%s):\n", net_dev->name);
- seq_puts(seq,
- " IPv4 MAC VID last-seen\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(dat_entry, head, hash_entry) {
- last_seen_jiffies = jiffies - dat_entry->last_update;
- last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
- last_seen_mins = last_seen_msecs / 60000;
- last_seen_msecs = last_seen_msecs % 60000;
- last_seen_secs = last_seen_msecs / 1000;
-
- seq_printf(seq, " * %15pI4 %pM %4i %6i:%02i\n",
- &dat_entry->ip, dat_entry->mac_addr,
- batadv_print_vid(dat_entry->vid),
- last_seen_mins, last_seen_secs);
- }
- rcu_read_unlock();
- }
-
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
* batadv_dat_cache_dump_entry() - dump one entry of the DAT cache table to a
* netlink socket
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 4e031661682a..e980fb45693a 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -12,7 +12,6 @@
#include <linux/compiler.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
@@ -74,7 +73,6 @@ batadv_dat_init_own_addr(struct batadv_priv *bat_priv,
int batadv_dat_init(struct batadv_priv *bat_priv);
void batadv_dat_free(struct batadv_priv *bat_priv);
-int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset);
int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb);
/**
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 9fdbe3068153..e522f1fcfd9a 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -14,8 +14,8 @@
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/lockdep.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
@@ -27,7 +27,6 @@
#include "originator.h"
#include "routing.h"
#include "send.h"
-#include "soft-interface.h"
/**
* batadv_frag_clear_chain() - delete entries in the fragment buffer chain
@@ -306,7 +305,7 @@ free:
* set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
* to NULL; 3) Error: Return false and free skb.
*
- * Return: true when the packet is merged or buffered, false when skb is not not
+ * Return: true when the packet is merged or buffered, false when skb is not
* used.
*/
bool batadv_frag_skb_buffer(struct sk_buff **skb,
@@ -391,6 +390,7 @@ out:
/**
* batadv_frag_create() - create a fragment from skb
+ * @net_dev: outgoing device for fragment
* @skb: skb to create fragment from
* @frag_head: header to use in new fragment
* @fragment_size: size of new fragment
@@ -401,22 +401,25 @@ out:
*
* Return: the new fragment, NULL on error.
*/
-static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
+static struct sk_buff *batadv_frag_create(struct net_device *net_dev,
+ struct sk_buff *skb,
struct batadv_frag_packet *frag_head,
unsigned int fragment_size)
{
+ unsigned int ll_reserved = LL_RESERVED_SPACE(net_dev);
+ unsigned int tailroom = net_dev->needed_tailroom;
struct sk_buff *skb_fragment;
unsigned int header_size = sizeof(*frag_head);
unsigned int mtu = fragment_size + header_size;
- skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN);
+ skb_fragment = dev_alloc_skb(ll_reserved + mtu + tailroom);
if (!skb_fragment)
goto err;
skb_fragment->priority = skb->priority;
/* Eat the last mtu-bytes of the skb */
- skb_reserve(skb_fragment, header_size + ETH_HLEN);
+ skb_reserve(skb_fragment, ll_reserved + header_size);
skb_split(skb, skb_fragment, skb->len - fragment_size);
/* Add the header */
@@ -439,11 +442,12 @@ int batadv_frag_send_packet(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
struct batadv_neigh_node *neigh_node)
{
+ struct net_device *net_dev = neigh_node->if_incoming->net_dev;
struct batadv_priv *bat_priv;
struct batadv_hard_iface *primary_if = NULL;
struct batadv_frag_packet frag_header;
struct sk_buff *skb_fragment;
- unsigned int mtu = neigh_node->if_incoming->net_dev->mtu;
+ unsigned int mtu = net_dev->mtu;
unsigned int header_size = sizeof(frag_header);
unsigned int max_fragment_size, num_fragments;
int ret;
@@ -503,7 +507,7 @@ int batadv_frag_send_packet(struct sk_buff *skb,
goto put_primary_if;
}
- skb_fragment = batadv_frag_create(skb, &frag_header,
+ skb_fragment = batadv_frag_create(net_dev, skb, &frag_header,
max_fragment_size);
if (!skb_fragment) {
ret = -ENOMEM;
@@ -522,13 +526,14 @@ int batadv_frag_send_packet(struct sk_buff *skb,
frag_header.no++;
}
- /* Make room for the fragment header. */
- if (batadv_skb_head_push(skb, header_size) < 0 ||
- pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) {
- ret = -ENOMEM;
+ /* make sure that there is at least enough head for the fragmentation
+ * and ethernet headers
+ */
+ ret = skb_cow_head(skb, ETH_HLEN + header_size);
+ if (ret < 0)
goto put_primary_if;
- }
+ skb_push(skb, header_size);
memcpy(skb->data, &frag_header, header_size);
/* Send the last fragment */
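The fragmentation hunks above replace the fixed ETH_HLEN reservation with the lower device's real link-layer headroom and tailroom, and grow the original skb with skb_cow_head() before pushing the fragment header. A condensed sketch of that allocation pattern follows; the helper names are hypothetical and the code is a simplification for illustration, not a copy of the batman-adv internals.

/* Sketch of the headroom/tailroom handling introduced above
 * (assumption: simplified, hypothetical helpers).
 */
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *alloc_fragment_skb(struct net_device *net_dev,
					  unsigned int header_size,
					  unsigned int fragment_size)
{
	unsigned int ll_reserved = LL_RESERVED_SPACE(net_dev);
	unsigned int tailroom = net_dev->needed_tailroom;
	struct sk_buff *skb;

	/* reserve what the lower device needs in front of and behind the data */
	skb = dev_alloc_skb(ll_reserved + header_size + fragment_size + tailroom);
	if (!skb)
		return NULL;

	skb_reserve(skb, ll_reserved + header_size);
	return skb;
}

static int push_fragment_header(struct sk_buff *skb, const void *hdr,
				unsigned int header_size)
{
	int ret;

	/* make sure the original skb has room for the new headers, too */
	ret = skb_cow_head(skb, ETH_HLEN + header_size);
	if (ret < 0)
		return ret;

	skb_push(skb, header_size);
	memcpy(skb->data, hdr, header_size);
	return 0;
}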
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index ef3f85b576c4..cffe72f4edd7 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -25,7 +25,6 @@
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -511,44 +510,6 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
spin_unlock_bh(&bat_priv->gw.list_lock);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-/**
- * batadv_gw_client_seq_print_text() - Print the gateway table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- return 0;
-
- seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
- BATADV_SOURCE_VERSION, primary_if->net_dev->name,
- primary_if->net_dev->dev_addr, net_dev->name,
- bat_priv->algo_ops->name);
-
- batadv_hardif_put(primary_if);
-
- if (!bat_priv->algo_ops->gw.print) {
- seq_puts(seq,
- "No printing function for this routing protocol\n");
- return 0;
- }
-
- bat_priv->algo_ops->gw.print(bat_priv, seq);
-
- return 0;
-}
-#endif
-
/**
* batadv_gw_dump() - Dump gateways into a message
* @msg: Netlink message to dump into
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 88b5dba84354..2fbc500f0ac1 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -10,7 +10,6 @@
#include "main.h"
#include <linux/netlink.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
@@ -31,7 +30,6 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv);
void batadv_gw_node_put(struct batadv_gw_node *gw_node);
struct batadv_gw_node *
batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv);
-int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset);
int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb);
bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb);
enum batadv_dhcp_recipient
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index fa06b51c0144..0f186ddc15e3 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -18,6 +18,7 @@
#include <linux/kref.h>
#include <linux/limits.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
@@ -31,14 +32,12 @@
#include "bat_v.h"
#include "bridge_loop_avoidance.h"
-#include "debugfs.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
#include "log.h"
#include "originator.h"
#include "send.h"
#include "soft-interface.h"
-#include "sysfs.h"
#include "translation-table.h"
/**
@@ -554,6 +553,9 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface)
needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN);
needed_headroom += batadv_max_header_len();
+ /* fragmentation headers don't strip the unicast/... header */
+ needed_headroom += sizeof(struct batadv_frag_packet);
+
soft_iface->needed_headroom = needed_headroom;
soft_iface->needed_tailroom = lower_tailroom;
}
@@ -599,7 +601,7 @@ out:
/* report to the other components the maximum amount of bytes that
* batman-adv can send over the wire (without considering the payload
* overhead). For example, this value is used by TT to compute the
- * maximum local table table size
+ * maximum local table size
*/
atomic_set(&bat_priv->packet_size_max, min_mtu);
@@ -843,11 +845,8 @@ static size_t batadv_hardif_cnt(const struct net_device *soft_iface)
/**
* batadv_hardif_disable_interface() - Remove hard interface from soft interface
* @hard_iface: hard interface to be removed
- * @autodel: whether to delete soft interface when it doesn't contain any other
- * slave interfaces
*/
-void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
- enum batadv_hard_if_cleanup autodel)
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
struct batadv_hard_iface *primary_if = NULL;
@@ -885,13 +884,9 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface);
/* nobody uses this interface anymore */
- if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) {
+ if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1)
batadv_gw_check_client_stop(bat_priv);
- if (autodel == BATADV_IF_CLEANUP_AUTO)
- batadv_softif_destroy_sysfs(hard_iface->soft_iface);
- }
-
hard_iface->soft_iface = NULL;
batadv_hardif_put(hard_iface);
@@ -904,7 +899,6 @@ static struct batadv_hard_iface *
batadv_hardif_add_interface(struct net_device *net_dev)
{
struct batadv_hard_iface *hard_iface;
- int ret;
ASSERT_RTNL();
@@ -917,16 +911,10 @@ batadv_hardif_add_interface(struct net_device *net_dev)
if (!hard_iface)
goto release_dev;
- ret = batadv_sysfs_add_hardif(&hard_iface->hardif_obj, net_dev);
- if (ret)
- goto free_if;
-
hard_iface->net_dev = net_dev;
hard_iface->soft_iface = NULL;
hard_iface->if_status = BATADV_IF_NOT_IN_USE;
- batadv_debugfs_add_hardif(hard_iface);
-
INIT_LIST_HEAD(&hard_iface->list);
INIT_HLIST_HEAD(&hard_iface->neigh_list);
@@ -950,8 +938,6 @@ batadv_hardif_add_interface(struct net_device *net_dev)
return hard_iface;
-free_if:
- kfree(hard_iface);
release_dev:
dev_put(net_dev);
out:
@@ -964,36 +950,16 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
/* first deactivate interface */
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
- batadv_hardif_disable_interface(hard_iface,
- BATADV_IF_CLEANUP_KEEP);
+ batadv_hardif_disable_interface(hard_iface);
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
return;
hard_iface->if_status = BATADV_IF_TO_BE_REMOVED;
- batadv_debugfs_del_hardif(hard_iface);
- batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
batadv_hardif_put(hard_iface);
}
/**
- * batadv_hardif_remove_interfaces() - Remove all hard interfaces
- */
-void batadv_hardif_remove_interfaces(void)
-{
- struct batadv_hard_iface *hard_iface, *hard_iface_tmp;
-
- rtnl_lock();
- list_for_each_entry_safe(hard_iface, hard_iface_tmp,
- &batadv_hardif_list, list) {
- list_del_rcu(&hard_iface->list);
- batadv_hardif_generation++;
- batadv_hardif_remove_interface(hard_iface);
- }
- rtnl_unlock();
-}
-
-/**
* batadv_hard_if_event_softif() - Handle events for soft interfaces
* @event: NETDEV_* event to handle
* @net_dev: net_device which generated an event
@@ -1007,13 +973,9 @@ static int batadv_hard_if_event_softif(unsigned long event,
switch (event) {
case NETDEV_REGISTER:
- batadv_sysfs_add_meshif(net_dev);
bat_priv = netdev_priv(net_dev);
batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS);
break;
- case NETDEV_CHANGENAME:
- batadv_debugfs_rename_meshif(net_dev);
- break;
}
return NOTIFY_DONE;
@@ -1078,9 +1040,6 @@ static int batadv_hard_if_event(struct notifier_block *this,
if (batadv_is_wifi_hardif(hard_iface))
hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
break;
- case NETDEV_CHANGENAME:
- batadv_debugfs_rename_hardif(hard_iface);
- break;
default:
break;
}
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index bad2e50135e8..f4b8e9efef19 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -42,12 +42,6 @@ enum batadv_hard_if_state {
/** @BATADV_IF_TO_BE_ACTIVATED: interface is getting activated */
BATADV_IF_TO_BE_ACTIVATED,
-
- /**
- * @BATADV_IF_I_WANT_YOU: interface is queued up (using sysfs) for being
- * added as slave interface of a batman-adv soft interface
- */
- BATADV_IF_I_WANT_YOU,
};
/**
@@ -73,22 +67,6 @@ enum batadv_hard_if_bcast {
BATADV_HARDIF_BCAST_DUPORIG,
};
-/**
- * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal
- */
-enum batadv_hard_if_cleanup {
- /**
- * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface
- */
- BATADV_IF_CLEANUP_KEEP,
-
- /**
- * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was
- * removed
- */
- BATADV_IF_CLEANUP_AUTO,
-};
-
extern struct notifier_block batadv_hard_if_notifier;
struct net_device *batadv_get_real_netdev(struct net_device *net_device);
@@ -98,9 +76,7 @@ struct batadv_hard_iface*
batadv_hardif_get_by_netdev(const struct net_device *net_dev);
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
struct net *net, const char *iface_name);
-void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
- enum batadv_hard_if_cleanup autodel);
-void batadv_hardif_remove_interfaces(void);
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface);
int batadv_hardif_min_mtu(struct net_device *soft_iface);
void batadv_update_min_mtu(struct net_device *soft_iface);
void batadv_hardif_release(struct kref *ref);
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
deleted file mode 100644
index 8bdabc03b0b2..000000000000
--- a/net/batman-adv/icmp_socket.c
+++ /dev/null
@@ -1,392 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- */
-
-#include "icmp_socket.h"
-#include "main.h"
-
-#include <linux/atomic.h>
-#include <linux/compiler.h>
-#include <linux/debugfs.h>
-#include <linux/errno.h>
-#include <linux/etherdevice.h>
-#include <linux/eventpoll.h>
-#include <linux/export.h>
-#include <linux/fcntl.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/if_ether.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/pkt_sched.h>
-#include <linux/poll.h>
-#include <linux/printk.h>
-#include <linux/sched.h> /* for linux/wait.h */
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/stddef.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
-#include <linux/wait.h>
-#include <uapi/linux/batadv_packet.h>
-
-#include "debugfs.h"
-#include "hard-interface.h"
-#include "log.h"
-#include "originator.h"
-#include "send.h"
-
-static struct batadv_socket_client *batadv_socket_client_hash[256];
-
-static void batadv_socket_add_packet(struct batadv_socket_client *socket_client,
- struct batadv_icmp_header *icmph,
- size_t icmp_len);
-
-/**
- * batadv_socket_init() - Initialize soft interface independent socket data
- */
-void batadv_socket_init(void)
-{
- memset(batadv_socket_client_hash, 0, sizeof(batadv_socket_client_hash));
-}
-
-static int batadv_socket_open(struct inode *inode, struct file *file)
-{
- unsigned int i;
- struct batadv_socket_client *socket_client;
-
- if (!try_module_get(THIS_MODULE))
- return -EBUSY;
-
- batadv_debugfs_deprecated(file, "");
-
- stream_open(inode, file);
-
- socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL);
- if (!socket_client) {
- module_put(THIS_MODULE);
- return -ENOMEM;
- }
-
- for (i = 0; i < ARRAY_SIZE(batadv_socket_client_hash); i++) {
- if (!batadv_socket_client_hash[i]) {
- batadv_socket_client_hash[i] = socket_client;
- break;
- }
- }
-
- if (i == ARRAY_SIZE(batadv_socket_client_hash)) {
- pr_err("Error - can't add another packet client: maximum number of clients reached\n");
- kfree(socket_client);
- module_put(THIS_MODULE);
- return -EXFULL;
- }
-
- INIT_LIST_HEAD(&socket_client->queue_list);
- socket_client->queue_len = 0;
- socket_client->index = i;
- socket_client->bat_priv = inode->i_private;
- spin_lock_init(&socket_client->lock);
- init_waitqueue_head(&socket_client->queue_wait);
-
- file->private_data = socket_client;
-
- return 0;
-}
-
-static int batadv_socket_release(struct inode *inode, struct file *file)
-{
- struct batadv_socket_client *client = file->private_data;
- struct batadv_socket_packet *packet, *tmp;
-
- spin_lock_bh(&client->lock);
-
- /* for all packets in the queue ... */
- list_for_each_entry_safe(packet, tmp, &client->queue_list, list) {
- list_del(&packet->list);
- kfree(packet);
- }
-
- batadv_socket_client_hash[client->index] = NULL;
- spin_unlock_bh(&client->lock);
-
- kfree(client);
- module_put(THIS_MODULE);
-
- return 0;
-}
-
-static ssize_t batadv_socket_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct batadv_socket_client *socket_client = file->private_data;
- struct batadv_socket_packet *socket_packet;
- size_t packet_len;
- int error;
-
- if ((file->f_flags & O_NONBLOCK) && socket_client->queue_len == 0)
- return -EAGAIN;
-
- if (!buf || count < sizeof(struct batadv_icmp_packet))
- return -EINVAL;
-
- error = wait_event_interruptible(socket_client->queue_wait,
- socket_client->queue_len);
-
- if (error)
- return error;
-
- spin_lock_bh(&socket_client->lock);
-
- socket_packet = list_first_entry(&socket_client->queue_list,
- struct batadv_socket_packet, list);
- list_del(&socket_packet->list);
- socket_client->queue_len--;
-
- spin_unlock_bh(&socket_client->lock);
-
- packet_len = min(count, socket_packet->icmp_len);
- error = copy_to_user(buf, &socket_packet->icmp_packet, packet_len);
-
- kfree(socket_packet);
-
- if (error)
- return -EFAULT;
-
- return packet_len;
-}
-
-static ssize_t batadv_socket_write(struct file *file, const char __user *buff,
- size_t len, loff_t *off)
-{
- struct batadv_socket_client *socket_client = file->private_data;
- struct batadv_priv *bat_priv = socket_client->bat_priv;
- struct batadv_hard_iface *primary_if = NULL;
- struct sk_buff *skb;
- struct batadv_icmp_packet_rr *icmp_packet_rr;
- struct batadv_icmp_header *icmp_header;
- struct batadv_orig_node *orig_node = NULL;
- struct batadv_neigh_node *neigh_node = NULL;
- size_t packet_len = sizeof(struct batadv_icmp_packet);
- u8 *addr;
-
- if (len < sizeof(struct batadv_icmp_header)) {
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Error - can't send packet from char device: invalid packet size\n");
- return -EINVAL;
- }
-
- primary_if = batadv_primary_if_get_selected(bat_priv);
-
- if (!primary_if) {
- len = -EFAULT;
- goto out;
- }
-
- if (len >= BATADV_ICMP_MAX_PACKET_SIZE)
- packet_len = BATADV_ICMP_MAX_PACKET_SIZE;
- else
- packet_len = len;
-
- skb = netdev_alloc_skb_ip_align(NULL, packet_len + ETH_HLEN);
- if (!skb) {
- len = -ENOMEM;
- goto out;
- }
-
- skb->priority = TC_PRIO_CONTROL;
- skb_reserve(skb, ETH_HLEN);
- icmp_header = skb_put(skb, packet_len);
-
- if (copy_from_user(icmp_header, buff, packet_len)) {
- len = -EFAULT;
- goto free_skb;
- }
-
- if (icmp_header->packet_type != BATADV_ICMP) {
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Error - can't send packet from char device: got bogus packet type (expected: BAT_ICMP)\n");
- len = -EINVAL;
- goto free_skb;
- }
-
- switch (icmp_header->msg_type) {
- case BATADV_ECHO_REQUEST:
- if (len < sizeof(struct batadv_icmp_packet)) {
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Error - can't send packet from char device: invalid packet size\n");
- len = -EINVAL;
- goto free_skb;
- }
-
- if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
- goto dst_unreach;
-
- orig_node = batadv_orig_hash_find(bat_priv, icmp_header->dst);
- if (!orig_node)
- goto dst_unreach;
-
- neigh_node = batadv_orig_router_get(orig_node,
- BATADV_IF_DEFAULT);
- if (!neigh_node)
- goto dst_unreach;
-
- if (!neigh_node->if_incoming)
- goto dst_unreach;
-
- if (neigh_node->if_incoming->if_status != BATADV_IF_ACTIVE)
- goto dst_unreach;
-
- icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmp_header;
- if (packet_len == sizeof(*icmp_packet_rr)) {
- addr = neigh_node->if_incoming->net_dev->dev_addr;
- ether_addr_copy(icmp_packet_rr->rr[0], addr);
- }
-
- break;
- default:
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Error - can't send packet from char device: got unknown message type\n");
- len = -EINVAL;
- goto free_skb;
- }
-
- icmp_header->uid = socket_client->index;
-
- if (icmp_header->version != BATADV_COMPAT_VERSION) {
- icmp_header->msg_type = BATADV_PARAMETER_PROBLEM;
- icmp_header->version = BATADV_COMPAT_VERSION;
- batadv_socket_add_packet(socket_client, icmp_header,
- packet_len);
- goto free_skb;
- }
-
- ether_addr_copy(icmp_header->orig, primary_if->net_dev->dev_addr);
-
- batadv_send_unicast_skb(skb, neigh_node);
- goto out;
-
-dst_unreach:
- icmp_header->msg_type = BATADV_DESTINATION_UNREACHABLE;
- batadv_socket_add_packet(socket_client, icmp_header, packet_len);
-free_skb:
- kfree_skb(skb);
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (neigh_node)
- batadv_neigh_node_put(neigh_node);
- if (orig_node)
- batadv_orig_node_put(orig_node);
- return len;
-}
-
-static __poll_t batadv_socket_poll(struct file *file, poll_table *wait)
-{
- struct batadv_socket_client *socket_client = file->private_data;
-
- poll_wait(file, &socket_client->queue_wait, wait);
-
- if (socket_client->queue_len > 0)
- return EPOLLIN | EPOLLRDNORM;
-
- return 0;
-}
-
-static const struct file_operations batadv_fops = {
- .owner = THIS_MODULE,
- .open = batadv_socket_open,
- .release = batadv_socket_release,
- .read = batadv_socket_read,
- .write = batadv_socket_write,
- .poll = batadv_socket_poll,
- .llseek = no_llseek,
-};
-
-/**
- * batadv_socket_setup() - Create debugfs "socket" file
- * @bat_priv: the bat priv with all the soft interface information
- */
-void batadv_socket_setup(struct batadv_priv *bat_priv)
-{
- debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir,
- bat_priv, &batadv_fops);
-}
-
-/**
- * batadv_socket_add_packet() - schedule an icmp packet to be sent to
- * userspace on an icmp socket.
- * @socket_client: the socket this packet belongs to
- * @icmph: pointer to the header of the icmp packet
- * @icmp_len: total length of the icmp packet
- */
-static void batadv_socket_add_packet(struct batadv_socket_client *socket_client,
- struct batadv_icmp_header *icmph,
- size_t icmp_len)
-{
- struct batadv_socket_packet *socket_packet;
- size_t len;
-
- socket_packet = kmalloc(sizeof(*socket_packet), GFP_ATOMIC);
-
- if (!socket_packet)
- return;
-
- len = icmp_len;
- /* check the maximum length before filling the buffer */
- if (len > sizeof(socket_packet->icmp_packet))
- len = sizeof(socket_packet->icmp_packet);
-
- INIT_LIST_HEAD(&socket_packet->list);
- memcpy(&socket_packet->icmp_packet, icmph, len);
- socket_packet->icmp_len = len;
-
- spin_lock_bh(&socket_client->lock);
-
- /* while waiting for the lock the socket_client could have been
- * deleted
- */
- if (!batadv_socket_client_hash[icmph->uid]) {
- spin_unlock_bh(&socket_client->lock);
- kfree(socket_packet);
- return;
- }
-
- list_add_tail(&socket_packet->list, &socket_client->queue_list);
- socket_client->queue_len++;
-
- if (socket_client->queue_len > 100) {
- socket_packet = list_first_entry(&socket_client->queue_list,
- struct batadv_socket_packet,
- list);
-
- list_del(&socket_packet->list);
- kfree(socket_packet);
- socket_client->queue_len--;
- }
-
- spin_unlock_bh(&socket_client->lock);
-
- wake_up(&socket_client->queue_wait);
-}
-
-/**
- * batadv_socket_receive_packet() - schedule an icmp packet to be received
- * locally and sent to userspace.
- * @icmph: pointer to the header of the icmp packet
- * @icmp_len: total length of the icmp packet
- */
-void batadv_socket_receive_packet(struct batadv_icmp_header *icmph,
- size_t icmp_len)
-{
- struct batadv_socket_client *hash;
-
- hash = batadv_socket_client_hash[icmph->uid];
- if (hash)
- batadv_socket_add_packet(hash, icmph, icmp_len);
-}
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
deleted file mode 100644
index 6abd0f4742ef..000000000000
--- a/net/batman-adv/icmp_socket.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- */
-
-#ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_
-#define _NET_BATMAN_ADV_ICMP_SOCKET_H_
-
-#include "main.h"
-
-#include <linux/types.h>
-#include <uapi/linux/batadv_packet.h>
-
-#define BATADV_ICMP_SOCKET "socket"
-
-void batadv_socket_setup(struct batadv_priv *bat_priv);
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-void batadv_socket_init(void);
-void batadv_socket_receive_packet(struct batadv_icmp_header *icmph,
- size_t icmp_len);
-
-#else
-
-static inline void batadv_socket_init(void)
-{
-}
-
-static inline void
-batadv_socket_receive_packet(struct batadv_icmp_header *icmph, size_t icmp_len)
-{
-}
-
-#endif
-
-#endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index a67b2b091447..b7e9923b11a2 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -7,213 +7,10 @@
#include "log.h"
#include "main.h"
-#include <linux/compiler.h>
-#include <linux/debugfs.h>
-#include <linux/errno.h>
-#include <linux/eventpoll.h>
-#include <linux/export.h>
-#include <linux/fcntl.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/poll.h>
-#include <linux/sched.h> /* for linux/wait.h */
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/uaccess.h>
-#include <linux/wait.h>
#include <stdarg.h>
-#include "debugfs.h"
#include "trace.h"
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1)
-
-static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN;
-
-static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log,
- size_t idx)
-{
- return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK];
-}
-
-static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log,
- char c)
-{
- char *char_addr;
-
- char_addr = batadv_log_char_addr(debug_log, debug_log->log_end);
- *char_addr = c;
- debug_log->log_end++;
-
- if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len)
- debug_log->log_start = debug_log->log_end - batadv_log_buff_len;
-}
-
-__printf(2, 3)
-static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log,
- const char *fmt, ...)
-{
- va_list args;
- static char debug_log_buf[256];
- char *p;
-
- if (!debug_log)
- return 0;
-
- spin_lock_bh(&debug_log->lock);
- va_start(args, fmt);
- vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args);
- va_end(args);
-
- for (p = debug_log_buf; *p != 0; p++)
- batadv_emit_log_char(debug_log, *p);
-
- spin_unlock_bh(&debug_log->lock);
-
- wake_up(&debug_log->queue_wait);
-
- return 0;
-}
-
-static int batadv_log_open(struct inode *inode, struct file *file)
-{
- if (!try_module_get(THIS_MODULE))
- return -EBUSY;
-
- batadv_debugfs_deprecated(file,
- "Use tracepoint batadv:batadv_dbg instead\n");
-
- stream_open(inode, file);
- file->private_data = inode->i_private;
- return 0;
-}
-
-static int batadv_log_release(struct inode *inode, struct file *file)
-{
- module_put(THIS_MODULE);
- return 0;
-}
-
-static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log)
-{
- return !(debug_log->log_start - debug_log->log_end);
-}
-
-static ssize_t batadv_log_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct batadv_priv *bat_priv = file->private_data;
- struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
- int error, i = 0;
- char *char_addr;
- char c;
-
- if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log))
- return -EAGAIN;
-
- if (!buf)
- return -EINVAL;
-
- if (count == 0)
- return 0;
-
- if (!access_ok(buf, count))
- return -EFAULT;
-
- error = wait_event_interruptible(debug_log->queue_wait,
- (!batadv_log_empty(debug_log)));
-
- if (error)
- return error;
-
- spin_lock_bh(&debug_log->lock);
-
- while ((!error) && (i < count) &&
- (debug_log->log_start != debug_log->log_end)) {
- char_addr = batadv_log_char_addr(debug_log,
- debug_log->log_start);
- c = *char_addr;
-
- debug_log->log_start++;
-
- spin_unlock_bh(&debug_log->lock);
-
- error = __put_user(c, buf);
-
- spin_lock_bh(&debug_log->lock);
-
- buf++;
- i++;
- }
-
- spin_unlock_bh(&debug_log->lock);
-
- if (!error)
- return i;
-
- return error;
-}
-
-static __poll_t batadv_log_poll(struct file *file, poll_table *wait)
-{
- struct batadv_priv *bat_priv = file->private_data;
- struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
-
- poll_wait(file, &debug_log->queue_wait, wait);
-
- if (!batadv_log_empty(debug_log))
- return EPOLLIN | EPOLLRDNORM;
-
- return 0;
-}
-
-static const struct file_operations batadv_log_fops = {
- .open = batadv_log_open,
- .release = batadv_log_release,
- .read = batadv_log_read,
- .poll = batadv_log_poll,
- .llseek = no_llseek,
-};
-
-/**
- * batadv_debug_log_setup() - Initialize debug log
- * @bat_priv: the bat priv with all the soft interface information
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_debug_log_setup(struct batadv_priv *bat_priv)
-{
- bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC);
- if (!bat_priv->debug_log)
- return -ENOMEM;
-
- spin_lock_init(&bat_priv->debug_log->lock);
- init_waitqueue_head(&bat_priv->debug_log->queue_wait);
-
- debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv,
- &batadv_log_fops);
- return 0;
-}
-
-/**
- * batadv_debug_log_cleanup() - Destroy debug log
- * @bat_priv: the bat priv with all the soft interface information
- */
-void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
-{
- kfree(bat_priv->debug_log);
- bat_priv->debug_log = NULL;
-}
-
-#endif /* CONFIG_BATMAN_ADV_DEBUGFS */
-
/**
* batadv_debug_log() - Add debug log entry
* @bat_priv: the bat priv with all the soft interface information
@@ -231,11 +28,6 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV",
- jiffies_to_msecs(jiffies), &vaf);
-#endif
-
trace_batadv_dbg(bat_priv, &vaf);
va_end(args);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 519c08c2cfba..ed9d87ce3407 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -23,12 +23,12 @@
#include <linux/kobject.h>
#include <linux/kref.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -44,12 +44,10 @@
#include "bat_iv_ogm.h"
#include "bat_v.h"
#include "bridge_loop_avoidance.h"
-#include "debugfs.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
#include "gateway_common.h"
#include "hard-interface.h"
-#include "icmp_socket.h"
#include "log.h"
#include "multicast.h"
#include "netlink.h"
@@ -113,9 +111,6 @@ static int __init batadv_init(void)
if (!batadv_event_workqueue)
goto err_create_wq;
- batadv_socket_init();
- batadv_debugfs_init();
-
register_netdevice_notifier(&batadv_hard_if_notifier);
rtnl_link_register(&batadv_link_ops);
batadv_netlink_register();
@@ -133,11 +128,9 @@ err_create_wq:
static void __exit batadv_exit(void)
{
- batadv_debugfs_destroy();
batadv_netlink_unregister();
rtnl_link_unregister(&batadv_link_ops);
unregister_netdevice_notifier(&batadv_hard_if_notifier);
- batadv_hardif_remove_interfaces();
flush_workqueue(batadv_event_workqueue);
destroy_workqueue(batadv_event_workqueue);
@@ -306,44 +299,6 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
return is_my_mac;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_seq_print_text_primary_if_get() - called from debugfs table printing
- * function that requires the primary interface
- * @seq: debugfs table seq_file struct
- *
- * Return: primary interface if found or NULL otherwise.
- */
-struct batadv_hard_iface *
-batadv_seq_print_text_primary_if_get(struct seq_file *seq)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
-
- primary_if = batadv_primary_if_get_selected(bat_priv);
-
- if (!primary_if) {
- seq_printf(seq,
- "BATMAN mesh %s disabled - please specify interfaces to enable it\n",
- net_dev->name);
- goto out;
- }
-
- if (primary_if->if_status == BATADV_IF_ACTIVE)
- goto out;
-
- seq_printf(seq,
- "BATMAN mesh %s disabled - primary interface not active\n",
- net_dev->name);
- batadv_hardif_put(primary_if);
- primary_if = NULL;
-
-out:
- return primary_if;
-}
-#endif
-
/**
* batadv_max_header_len() - calculate maximum encapsulation overhead for a
* payload packet
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 0393bb9ed3d0..288201630ceb 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -13,7 +13,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2020.3"
+#define BATADV_SOURCE_VERSION "2021.0"
#endif
/* B.A.T.M.A.N. parameters */
@@ -212,7 +212,6 @@ enum batadv_uev_type {
#include <linux/jiffies.h>
#include <linux/netdevice.h>
#include <linux/percpu.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
@@ -243,8 +242,6 @@ extern struct workqueue_struct *batadv_event_workqueue;
int batadv_mesh_init(struct net_device *soft_iface);
void batadv_mesh_free(struct net_device *soft_iface);
bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr);
-struct batadv_hard_iface *
-batadv_seq_print_text_primary_if_get(struct seq_file *seq);
int batadv_max_header_len(void);
void batadv_skb_set_priority(struct sk_buff *skb, int offset);
int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index ca24a2e522b7..854e5ff28a3f 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -33,7 +33,6 @@
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -208,7 +207,7 @@ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv,
return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
/* TODO: ask the bridge if a multicast router is present (the bridge
- * is capable of performing proper RFC4286 multicast multicast router
+ * is capable of performing proper RFC4286 multicast router
* discovery) instead of searching for a ff02::2 listener here
*/
ret = br_multicast_list_adjacent(dev, &bridge_mcast_list);
@@ -221,7 +220,7 @@ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv,
* address here, only IPv6 ones
*/
if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) &&
- ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.u.ip6))
+ ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.dst.ip6))
flags &= ~BATADV_MCAST_WANT_NO_RTR6;
list_del(&br_ip_entry->list);
@@ -562,10 +561,10 @@ out:
static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
{
if (src->proto == htons(ETH_P_IP))
- ip_eth_mc_map(src->u.ip4, dst);
+ ip_eth_mc_map(src->dst.ip4, dst);
#if IS_ENABLED(CONFIG_IPV6)
else if (src->proto == htons(ETH_P_IPV6))
- ipv6_eth_mc_map(&src->u.ip6, dst);
+ ipv6_eth_mc_map(&src->dst.ip6, dst);
#endif
else
eth_zero_addr(dst);
@@ -609,11 +608,11 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev,
continue;
if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
- ipv4_is_local_multicast(br_ip_entry->addr.u.ip4))
+ ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4))
continue;
if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) &&
- !ipv4_is_local_multicast(br_ip_entry->addr.u.ip4))
+ !ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4))
continue;
}
@@ -623,11 +622,11 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev,
continue;
if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
- ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.u.ip6))
+ ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.dst.ip6))
continue;
if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) &&
- IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.u.ip6) >
+ IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.dst.ip6) >
IPV6_ADDR_SCOPE_LINKLOCAL)
continue;
}
@@ -2074,116 +2073,6 @@ void batadv_mcast_init(struct batadv_priv *bat_priv)
batadv_mcast_start_timer(bat_priv);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_mcast_flags_print_header() - print own mcast flags to debugfs table
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: debugfs table seq_file struct
- *
- * Prints our own multicast flags including a more specific reason why
- * they are set, that is prints the bridge and querier state too, to
- * the debugfs table specified via @seq.
- */
-static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- struct batadv_mcast_mla_flags *mla_flags = &bat_priv->mcast.mla_flags;
- char querier4, querier6, shadowing4, shadowing6;
- bool bridged = mla_flags->bridged;
- u8 flags = mla_flags->tvlv_flags;
-
- if (bridged) {
- querier4 = mla_flags->querier_ipv4.exists ? '.' : '4';
- querier6 = mla_flags->querier_ipv6.exists ? '.' : '6';
- shadowing4 = mla_flags->querier_ipv4.shadowing ? '4' : '.';
- shadowing6 = mla_flags->querier_ipv6.shadowing ? '6' : '.';
- } else {
- querier4 = '?';
- querier6 = '?';
- shadowing4 = '?';
- shadowing6 = '?';
- }
-
- seq_printf(seq, "Multicast flags (own flags: [%c%c%c%s%s])\n",
- (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
- !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
- !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ");
- seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.');
- seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n",
- querier4, querier6);
- seq_printf(seq, "* Shadowing IGMP/MLD Querier [4/6]:\t%c/%c\n",
- shadowing4, shadowing6);
- seq_puts(seq, "-------------------------------------------\n");
- seq_printf(seq, " %-10s %s\n", "Originator", "Flags");
-}
-
-/**
- * batadv_mcast_flags_seq_print_text() - print the mcast flags of other nodes
- * @seq: seq file to print on
- * @offset: not used
- *
- * This prints a table of (primary) originators and their according
- * multicast flags, including (in the header) our own.
- *
- * Return: always 0
- */
-int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct batadv_orig_node *orig_node;
- struct hlist_head *head;
- u8 flags;
- u32 i;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- return 0;
-
- batadv_mcast_flags_print_header(bat_priv, seq);
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
- &orig_node->capa_initialized))
- continue;
-
- if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
- &orig_node->capabilities)) {
- seq_printf(seq, "%pM -\n", orig_node->orig);
- continue;
- }
-
- flags = orig_node->mcast_flags;
-
- seq_printf(seq, "%pM [%c%c%c%s%s]\n", orig_node->orig,
- (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)
- ? 'U' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV4)
- ? '4' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV6)
- ? '6' : '.',
- !(flags & BATADV_MCAST_WANT_NO_RTR4)
- ? "R4" : ". ",
- !(flags & BATADV_MCAST_WANT_NO_RTR6)
- ? "R6" : ". ");
- }
- rcu_read_unlock();
- }
-
- batadv_hardif_put(primary_if);
-
- return 0;
-}
-#endif
-
/**
* batadv_mcast_mesh_info_put() - put multicast info into a netlink message
* @msg: buffer for the message
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 3e114bc5ca3b..d61593d02072 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -10,7 +10,6 @@
#include "main.h"
#include <linux/netlink.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
/**
@@ -56,8 +55,6 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb,
void batadv_mcast_init(struct batadv_priv *bat_priv);
-int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset);
-
int batadv_mcast_mesh_info_put(struct sk_buff *msg,
struct batadv_priv *bat_priv);
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index dc193618a761..97bcf149633d 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -23,6 +23,7 @@
#include <linux/kernel.h>
#include <linux/limits.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/printk.h>
@@ -1350,7 +1351,7 @@ static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
}
}
-static const struct genl_ops batadv_netlink_ops[] = {
+static const struct genl_small_ops batadv_netlink_ops[] = {
{
.cmd = BATADV_CMD_GET_MESH,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -1484,8 +1485,8 @@ struct genl_family batadv_netlink_family __ro_after_init = {
.pre_doit = batadv_pre_doit,
.post_doit = batadv_post_doit,
.module = THIS_MODULE,
- .ops = batadv_netlink_ops,
- .n_ops = ARRAY_SIZE(batadv_netlink_ops),
+ .small_ops = batadv_netlink_ops,
+ .n_small_ops = ARRAY_SIZE(batadv_netlink_ops),
.mcgrps = batadv_netlink_mcgrps,
.n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps),
};
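
Note: the .ops/.n_ops to .small_ops/.n_small_ops switch above follows the generic netlink split into full and small operation tables. A minimal sketch of that registration pattern, using a hypothetical "demo" family and a stub handler (names and command number are illustrative, not part of this patch):

#include <linux/kernel.h>
#include <linux/module.h>
#include <net/genetlink.h>

/* stub doit handler for the hypothetical demo family */
static int demo_doit(struct sk_buff *skb, struct genl_info *info)
{
	return 0;
}

static const struct genl_small_ops demo_small_ops[] = {
	{
		.cmd = 1,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = demo_doit,
	},
};

static struct genl_family demo_family __ro_after_init = {
	.name = "demo",
	.version = 1,
	.module = THIS_MODULE,
	/* small_ops lacks per-op policy/maxattr, saving space for families
	 * whose operations do not need them
	 */
	.small_ops = demo_small_ops,
	.n_small_ops = ARRAY_SIZE(demo_small_ops),
};

/* registered from module init with genl_register_family(&demo_family) */
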
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 48d707850f3e..0cec108b7a99 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -11,7 +11,6 @@
#include <linux/bitops.h>
#include <linux/byteorder/generic.h>
#include <linux/compiler.h>
-#include <linux/debugfs.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -26,11 +25,10 @@
#include <linux/lockdep.h>
#include <linux/net.h>
#include <linux/netdevice.h>
+#include <linux/prandom.h>
#include <linux/printk.h>
-#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -39,7 +37,6 @@
#include <linux/workqueue.h>
#include <uapi/linux/batadv_packet.h>
-#include "hard-interface.h"
#include "hash.h"
#include "log.h"
#include "originator.h"
@@ -250,7 +247,7 @@ static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
/**
* batadv_nc_packet_free() - frees nc packet
* @nc_packet: the nc packet to free
- * @dropped: whether the packet is freed because is is dropped
+ * @dropped: whether the packet is freed because is dropped
*/
static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet,
bool dropped)
@@ -1876,87 +1873,3 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL);
batadv_hash_destroy(bat_priv->nc.decoding_hash);
}
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_nc_nodes_seq_print_text() - print the nc node information
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- struct batadv_orig_node *orig_node;
- struct batadv_nc_node *nc_node;
- int i;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- /* Traverse list of originators */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- /* For each orig_node in this bin */
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- /* no need to print the orig node if it does not have
- * network coding neighbors
- */
- if (list_empty(&orig_node->in_coding_list) &&
- list_empty(&orig_node->out_coding_list))
- continue;
-
- seq_printf(seq, "Node: %pM\n", orig_node->orig);
-
- seq_puts(seq, " Ingoing: ");
- /* For each in_nc_node to this orig_node */
- list_for_each_entry_rcu(nc_node,
- &orig_node->in_coding_list,
- list)
- seq_printf(seq, "%pM ",
- nc_node->addr);
- seq_puts(seq, "\n Outgoing: ");
- /* For out_nc_node to this orig_node */
- list_for_each_entry_rcu(nc_node,
- &orig_node->out_coding_list,
- list)
- seq_printf(seq, "%pM ",
- nc_node->addr);
- seq_puts(seq, "\n\n");
- }
- rcu_read_unlock();
- }
-
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-
-/**
- * batadv_nc_init_debugfs() - create nc folder and related files in debugfs
- * @bat_priv: the bat priv with all the soft interface information
- */
-void batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
-{
- struct dentry *nc_dir;
-
- nc_dir = debugfs_create_dir("nc", bat_priv->debug_dir);
-
- debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq);
-
- debugfs_create_u32("max_fwd_delay", 0644, nc_dir,
- &bat_priv->nc.max_fwd_delay);
-
- debugfs_create_u32("max_buffer_time", 0644, nc_dir,
- &bat_priv->nc.max_buffer_time);
-}
-#endif
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index 334289084127..8fb2c01e7837 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -10,7 +10,6 @@
#include "main.h"
#include <linux/netdevice.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
@@ -38,8 +37,6 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
struct sk_buff *skb);
void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
struct sk_buff *skb);
-int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset);
-void batadv_nc_init_debugfs(struct batadv_priv *bat_priv);
#else /* ifdef CONFIG_BATMAN_ADV_NC */
@@ -104,16 +101,6 @@ batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
{
}
-static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq,
- void *offset)
-{
- return 0;
-}
-
-static inline void batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
-{
-}
-
#endif /* ifdef CONFIG_BATMAN_ADV_NC */
#endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 805d8969bdfb..77431e59b228 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -20,7 +20,6 @@
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -733,42 +732,6 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_hardif_neigh_seq_print_text() - print the single hop neighbour list
- * @seq: neighbour table seq_file struct
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- return 0;
-
- seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
- BATADV_SOURCE_VERSION, primary_if->net_dev->name,
- primary_if->net_dev->dev_addr, net_dev->name,
- bat_priv->algo_ops->name);
-
- batadv_hardif_put(primary_if);
-
- if (!bat_priv->algo_ops->neigh.print) {
- seq_puts(seq,
- "No printing function for this routing protocol\n");
- return 0;
- }
-
- bat_priv->algo_ops->neigh.print(bat_priv, seq);
- return 0;
-}
-#endif
-
/**
* batadv_hardif_neigh_dump() - Dump to netlink the neighbor infos for a
* specific outgoing interface
@@ -1382,90 +1345,6 @@ static void batadv_purge_orig(struct work_struct *work)
msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-/**
- * batadv_orig_seq_print_text() - Print the originator table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- return 0;
-
- seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
- BATADV_SOURCE_VERSION, primary_if->net_dev->name,
- primary_if->net_dev->dev_addr, net_dev->name,
- bat_priv->algo_ops->name);
-
- batadv_hardif_put(primary_if);
-
- if (!bat_priv->algo_ops->orig.print) {
- seq_puts(seq,
- "No printing function for this routing protocol\n");
- return 0;
- }
-
- bat_priv->algo_ops->orig.print(bat_priv, seq, BATADV_IF_DEFAULT);
-
- return 0;
-}
-
-/**
- * batadv_orig_hardif_seq_print_text() - writes originator infos for a specific
- * outgoing interface
- * @seq: debugfs table seq_file struct
- * @offset: not used
- *
- * Return: 0
- */
-int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_hard_iface *hard_iface;
- struct batadv_priv *bat_priv;
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
-
- if (!hard_iface || !hard_iface->soft_iface) {
- seq_puts(seq, "Interface not known to B.A.T.M.A.N.\n");
- goto out;
- }
-
- bat_priv = netdev_priv(hard_iface->soft_iface);
- if (!bat_priv->algo_ops->orig.print) {
- seq_puts(seq,
- "No printing function for this routing protocol\n");
- goto out;
- }
-
- if (hard_iface->if_status != BATADV_IF_ACTIVE) {
- seq_puts(seq, "Interface not active\n");
- goto out;
- }
-
- seq_printf(seq, "[B.A.T.M.A.N. adv %s, IF/MAC: %s/%pM (%s %s)]\n",
- BATADV_SOURCE_VERSION, hard_iface->net_dev->name,
- hard_iface->net_dev->dev_addr,
- hard_iface->soft_iface->name, bat_priv->algo_ops->name);
-
- bat_priv->algo_ops->orig.print(bat_priv, seq, hard_iface);
-
-out:
- if (hard_iface)
- batadv_hardif_put(hard_iface);
- return 0;
-}
-#endif
-
/**
* batadv_orig_dump() - Dump to netlink the originator infos for a specific
* outgoing interface
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 7bc01c138b3a..e75d4c4d11f5 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -13,7 +13,6 @@
#include <linux/if_ether.h>
#include <linux/jhash.h>
#include <linux/netlink.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/types.h>
@@ -46,7 +45,6 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo);
int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb);
-int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset);
struct batadv_orig_ifinfo *
batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
@@ -56,9 +54,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *if_outgoing);
void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo);
-int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb);
-int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
struct batadv_orig_node_vlan *
batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
unsigned short vid);
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 9e5c71e406ff..49cbca4aa428 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -29,7 +29,6 @@
#include "distributed-arp-table.h"
#include "fragmentation.h"
#include "hard-interface.h"
-#include "icmp_socket.h"
#include "log.h"
#include "network-coding.h"
#include "originator.h"
@@ -227,15 +226,6 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
icmph = (struct batadv_icmp_header *)skb->data;
switch (icmph->msg_type) {
- case BATADV_ECHO_REPLY:
- case BATADV_DESTINATION_UNREACHABLE:
- case BATADV_TTL_EXCEEDED:
- /* receive the packet */
- if (skb_linearize(skb) < 0)
- break;
-
- batadv_socket_receive_packet(icmph, skb->len);
- break;
case BATADV_ECHO_REQUEST:
/* answer echo request (ping) */
primary_if = batadv_primary_if_get_selected(bat_priv);
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index d267b94800d6..87017332b567 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -461,7 +461,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
/**
* batadv_forw_packet_free() - free a forwarding packet
* @forw_packet: The packet to free
- * @dropped: whether the packet is freed because is is dropped
+ * @dropped: whether the packet is freed because is dropped
*
* This frees a forwarding packet and releases any resources it might
* have claimed.
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index cdde943c1b83..97118efbe678 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -30,7 +30,6 @@
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
@@ -38,12 +37,12 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <net/netlink.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
#include "bridge_loop_avoidance.h"
-#include "debugfs.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
#include "hard-interface.h"
@@ -51,7 +50,6 @@
#include "network-coding.h"
#include "originator.h"
#include "send.h"
-#include "sysfs.h"
#include "translation-table.h"
/**
@@ -574,7 +572,6 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
{
struct batadv_softif_vlan *vlan;
- int err;
spin_lock_bh(&bat_priv->softif_vlan_list_lock);
@@ -601,19 +598,6 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list);
spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
- /* batadv_sysfs_add_vlan cannot be in the spinlock section due to the
- * sleeping behavior of the sysfs functions and the fs_reclaim lock
- */
- err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
- if (err) {
- /* ref for the function */
- batadv_softif_vlan_put(vlan);
-
- /* ref for the list */
- batadv_softif_vlan_put(vlan);
- return err;
- }
-
/* add a new TT local entry. This one will be marked with the NOPURGE
* flag
*/
@@ -641,14 +625,13 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv,
batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr,
vlan->vid, "vlan interface destroyed", false);
- batadv_sysfs_del_vlan(bat_priv, vlan);
batadv_softif_vlan_put(vlan);
}
/**
* batadv_interface_add_vid() - ndo_add_vid API implementation
* @dev: the netdev of the mesh interface
- * @proto: protocol of the the vlan id
+ * @proto: protocol of the vlan id
* @vid: identifier of the new vlan
*
* Set up all the internal structures for handling the new vlan on top of the
@@ -661,7 +644,6 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
{
struct batadv_priv *bat_priv = netdev_priv(dev);
struct batadv_softif_vlan *vlan;
- int ret;
/* only 802.1Q vlans are supported.
* batman-adv does not know how to handle other types
@@ -681,17 +663,6 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
if (!vlan)
return batadv_softif_create_vlan(bat_priv, vid);
- /* recreate the sysfs object if it was already destroyed (and it should
- * be since we received a kill_vid() for this vlan
- */
- if (!vlan->kobj) {
- ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
- if (ret) {
- batadv_softif_vlan_put(vlan);
- return ret;
- }
- }
-
/* add a new TT local entry. This one will be marked with the NOPURGE
* flag. This must be added again, even if the vlan object already
* exists, because the entry was deleted by kill_vid()
@@ -706,7 +677,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
/**
* batadv_interface_kill_vid() - ndo_kill_vid API implementation
* @dev: the netdev of the mesh interface
- * @proto: protocol of the the vlan id
+ * @proto: protocol of the vlan id
* @vid: identifier of the deleted vlan
*
* Destroy all the internal structures used to handle the vlan identified by vid
@@ -845,22 +816,18 @@ static int batadv_softif_init_late(struct net_device *dev)
batadv_nc_init_bat_priv(bat_priv);
- ret = batadv_algo_select(bat_priv, batadv_routing_algo);
- if (ret < 0)
- goto free_bat_counters;
-
- ret = batadv_debugfs_add_meshif(dev);
- if (ret < 0)
- goto free_bat_counters;
+ if (!bat_priv->algo_ops) {
+ ret = batadv_algo_select(bat_priv, batadv_routing_algo);
+ if (ret < 0)
+ goto free_bat_counters;
+ }
ret = batadv_mesh_init(dev);
if (ret < 0)
- goto unreg_debugfs;
+ goto free_bat_counters;
return 0;
-unreg_debugfs:
- batadv_debugfs_del_meshif(dev);
free_bat_counters:
free_percpu(bat_priv->bat_counters);
bat_priv->bat_counters = NULL;
@@ -914,7 +881,7 @@ static int batadv_softif_slave_del(struct net_device *dev,
if (!hard_iface || hard_iface->soft_iface != dev)
goto out;
- batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_KEEP);
+ batadv_hardif_disable_interface(hard_iface);
ret = 0;
out:
@@ -1037,7 +1004,6 @@ static const struct ethtool_ops batadv_ethtool_ops = {
*/
static void batadv_softif_free(struct net_device *dev)
{
- batadv_debugfs_del_meshif(dev);
batadv_mesh_free(dev);
/* some scheduled RCU callbacks need the bat_priv struct to accomplish
@@ -1074,6 +1040,59 @@ static void batadv_softif_init_early(struct net_device *dev)
}
/**
+ * batadv_softif_validate() - validate configuration of new batadv link
+ * @tb: IFLA_INFO_DATA netlink attributes
+ * @data: enum batadv_ifla_attrs attributes
+ * @extack: extended ACK report struct
+ *
+ * Return: 0 if successful or error otherwise.
+ */
+static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct batadv_algo_ops *algo_ops;
+
+ if (!data)
+ return 0;
+
+ if (data[IFLA_BATADV_ALGO_NAME]) {
+ algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME]));
+ if (!algo_ops)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * batadv_softif_newlink() - pre-initialize and register new batadv link
+ * @src_net: the applicable net namespace
+ * @dev: network device to register
+ * @tb: IFLA_INFO_DATA netlink attributes
+ * @data: enum batadv_ifla_attrs attributes
+ * @extack: extended ACK report struct
+ *
+ * Return: 0 if successful or error otherwise.
+ */
+static int batadv_softif_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ const char *algo_name;
+ int err;
+
+ if (data && data[IFLA_BATADV_ALGO_NAME]) {
+ algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]);
+ err = batadv_algo_select(bat_priv, algo_name);
+ if (err)
+ return -EINVAL;
+ }
+
+ return register_netdevice(dev);
+}
+
+/**
* batadv_softif_create() - Create and register soft interface
* @net: the applicable net namespace
* @name: name of the new soft interface
@@ -1106,28 +1125,6 @@ struct net_device *batadv_softif_create(struct net *net, const char *name)
}
/**
- * batadv_softif_destroy_sysfs() - deletion of batadv_soft_interface via sysfs
- * @soft_iface: the to-be-removed batman-adv interface
- */
-void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
-{
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
- struct batadv_softif_vlan *vlan;
-
- ASSERT_RTNL();
-
- /* destroy the "untagged" VLAN */
- vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
- if (vlan) {
- batadv_softif_destroy_vlan(bat_priv, vlan);
- batadv_softif_vlan_put(vlan);
- }
-
- batadv_sysfs_del_meshif(soft_iface);
- unregister_netdevice(soft_iface);
-}
-
-/**
* batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via
* netlink
* @soft_iface: the to-be-removed batman-adv interface
@@ -1142,8 +1139,7 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
list_for_each_entry(hard_iface, &batadv_hardif_list, list) {
if (hard_iface->soft_iface == soft_iface)
- batadv_hardif_disable_interface(hard_iface,
- BATADV_IF_CLEANUP_KEEP);
+ batadv_hardif_disable_interface(hard_iface);
}
/* destroy the "untagged" VLAN */
@@ -1153,7 +1149,6 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
batadv_softif_vlan_put(vlan);
}
- batadv_sysfs_del_meshif(soft_iface);
unregister_netdevice_queue(soft_iface, head);
}
@@ -1171,9 +1166,17 @@ bool batadv_softif_is_valid(const struct net_device *net_dev)
return false;
}
+static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = {
+ [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING },
+};
+
struct rtnl_link_ops batadv_link_ops __read_mostly = {
.kind = "batadv",
.priv_size = sizeof(struct batadv_priv),
.setup = batadv_softif_init_early,
+ .maxtype = IFLA_BATADV_MAX,
+ .policy = batadv_ifla_policy,
+ .validate = batadv_softif_validate,
+ .newlink = batadv_softif_newlink,
.dellink = batadv_softif_destroy_netlink,
};
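
Note: with the sysfs path removed, link creation and routing-algorithm selection go entirely through rtnetlink, as in the batadv_link_ops above. A self-contained sketch of the same policy/validate/newlink pattern for a hypothetical "demo" link type (the IFLA_DEMO_MODE attribute and the "fast"/"slow" modes are illustrative assumptions, not from this patch):

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/string.h>
#include <net/rtnetlink.h>

enum {
	IFLA_DEMO_UNSPEC,
	IFLA_DEMO_MODE,		/* hypothetical NUL-terminated string attribute */
	__IFLA_DEMO_MAX,
};
#define IFLA_DEMO_MAX (__IFLA_DEMO_MAX - 1)

static const struct nla_policy demo_policy[IFLA_DEMO_MAX + 1] = {
	[IFLA_DEMO_MODE] = { .type = NLA_NUL_STRING },
};

static void demo_setup(struct net_device *dev)
{
	ether_setup(dev);	/* plain Ethernet-like defaults */
}

/* reject unknown modes before any per-device state is touched */
static int demo_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (!data || !data[IFLA_DEMO_MODE])
		return 0;

	if (strcmp(nla_data(data[IFLA_DEMO_MODE]), "fast") &&
	    strcmp(nla_data(data[IFLA_DEMO_MODE]), "slow"))
		return -EINVAL;

	return 0;
}

/* apply the requested mode, then register the netdev as usual */
static int demo_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	if (data && data[IFLA_DEMO_MODE])
		netdev_info(dev, "mode: %s\n",
			    (char *)nla_data(data[IFLA_DEMO_MODE]));

	return register_netdevice(dev);
}

static struct rtnl_link_ops demo_link_ops __read_mostly = {
	.kind		= "demo",
	.setup		= demo_setup,
	.maxtype	= IFLA_DEMO_MAX,
	.policy		= demo_policy,
	.validate	= demo_validate,
	.newlink	= demo_newlink,
	.dellink	= unregister_netdevice_queue,
};

/* registered from module init with rtnl_link_register(&demo_link_ops) */
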
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 534e08d6ad91..74716d9ca4f6 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -20,7 +20,6 @@ void batadv_interface_rx(struct net_device *soft_iface,
struct sk_buff *skb, int hdr_size,
struct batadv_orig_node *orig_node);
struct net_device *batadv_softif_create(struct net *net, const char *name);
-void batadv_softif_destroy_sysfs(struct net_device *soft_iface);
bool batadv_softif_is_valid(const struct net_device *net_dev);
extern struct rtnl_link_ops batadv_link_ops;
int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid);
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
deleted file mode 100644
index 0f962dcd239e..000000000000
--- a/net/batman-adv/sysfs.c
+++ /dev/null
@@ -1,1272 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- */
-
-#include "sysfs.h"
-#include "main.h"
-
-#include <asm/current.h>
-#include <linux/atomic.h>
-#include <linux/compiler.h>
-#include <linux/device.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/if.h>
-#include <linux/if_vlan.h>
-#include <linux/kernel.h>
-#include <linux/kobject.h>
-#include <linux/kref.h>
-#include <linux/limits.h>
-#include <linux/netdevice.h>
-#include <linux/printk.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/rtnetlink.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/stddef.h>
-#include <linux/string.h>
-#include <linux/stringify.h>
-#include <linux/workqueue.h>
-#include <uapi/linux/batadv_packet.h>
-#include <uapi/linux/batman_adv.h>
-
-#include "bridge_loop_avoidance.h"
-#include "distributed-arp-table.h"
-#include "gateway_client.h"
-#include "gateway_common.h"
-#include "hard-interface.h"
-#include "log.h"
-#include "netlink.h"
-#include "network-coding.h"
-#include "soft-interface.h"
-
-/**
- * batadv_sysfs_deprecated() - Log use of deprecated batadv sysfs access
- * @attr: attribute which was accessed
- */
-static void batadv_sysfs_deprecated(struct attribute *attr)
-{
- pr_warn_ratelimited(DEPRECATED "%s (pid %d) Use of sysfs file \"%s\".\nUse batadv genl family instead",
- current->comm, task_pid_nr(current), attr->name);
-}
-
-static struct net_device *batadv_kobj_to_netdev(struct kobject *obj)
-{
- struct device *dev = container_of(obj->parent, struct device, kobj);
-
- return to_net_dev(dev);
-}
-
-static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(obj);
-
- return netdev_priv(net_dev);
-}
-
-/**
- * batadv_vlan_kobj_to_batpriv() - convert a vlan kobj in the associated batpriv
- * @obj: kobject to covert
- *
- * Return: the associated batadv_priv struct.
- */
-static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj)
-{
- /* VLAN specific attributes are located in the root sysfs folder if they
- * refer to the untagged VLAN..
- */
- if (!strcmp(BATADV_SYSFS_IF_MESH_SUBDIR, obj->name))
- return batadv_kobj_to_batpriv(obj);
-
- /* ..while the attributes for the tagged vlans are located in
- * the in the corresponding "vlan%VID" subfolder
- */
- return batadv_kobj_to_batpriv(obj->parent);
-}
-
-/**
- * batadv_kobj_to_vlan() - convert a kobj in the associated softif_vlan struct
- * @bat_priv: the bat priv with all the soft interface information
- * @obj: kobject to covert
- *
- * Return: the associated softif_vlan struct if found, NULL otherwise.
- */
-static struct batadv_softif_vlan *
-batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj)
-{
- struct batadv_softif_vlan *vlan_tmp, *vlan = NULL;
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) {
- if (vlan_tmp->kobj != obj)
- continue;
-
- if (!kref_get_unless_zero(&vlan_tmp->refcount))
- continue;
-
- vlan = vlan_tmp;
- break;
- }
- rcu_read_unlock();
-
- return vlan;
-}
-
-/* Use this, if you have customized show and store functions for vlan attrs */
-#define BATADV_ATTR_VLAN(_name, _mode, _show, _store) \
-struct batadv_attribute batadv_attr_vlan_##_name = { \
- .attr = {.name = __stringify(_name), \
- .mode = _mode }, \
- .show = _show, \
- .store = _store, \
-}
-
-/* Use this, if you have customized show and store functions */
-#define BATADV_ATTR(_name, _mode, _show, _store) \
-struct batadv_attribute batadv_attr_##_name = { \
- .attr = {.name = __stringify(_name), \
- .mode = _mode }, \
- .show = _show, \
- .store = _store, \
-}
-
-#define BATADV_ATTR_SIF_STORE_BOOL(_name, _post_func) \
-ssize_t batadv_store_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff, \
- size_t count) \
-{ \
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
- struct batadv_priv *bat_priv = netdev_priv(net_dev); \
- ssize_t length; \
- \
- batadv_sysfs_deprecated(attr); \
- length = __batadv_store_bool_attr(buff, count, _post_func, attr,\
- &bat_priv->_name, net_dev); \
- \
- batadv_netlink_notify_mesh(bat_priv); \
- \
- return length; \
-}
-
-#define BATADV_ATTR_SIF_SHOW_BOOL(_name) \
-ssize_t batadv_show_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff) \
-{ \
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \
- \
- batadv_sysfs_deprecated(attr); \
- return sprintf(buff, "%s\n", \
- atomic_read(&bat_priv->_name) == 0 ? \
- "disabled" : "enabled"); \
-} \
-
-/* Use this, if you are going to turn a [name] in the soft-interface
- * (bat_priv) on or off
- */
-#define BATADV_ATTR_SIF_BOOL(_name, _mode, _post_func) \
- static BATADV_ATTR_SIF_STORE_BOOL(_name, _post_func) \
- static BATADV_ATTR_SIF_SHOW_BOOL(_name) \
- static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
- batadv_store_##_name)
-
-#define BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func) \
-ssize_t batadv_store_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff, \
- size_t count) \
-{ \
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
- struct batadv_priv *bat_priv = netdev_priv(net_dev); \
- ssize_t length; \
- \
- batadv_sysfs_deprecated(attr); \
- length = __batadv_store_uint_attr(buff, count, _min, _max, \
- _post_func, attr, \
- &bat_priv->_var, net_dev, \
- NULL); \
- \
- batadv_netlink_notify_mesh(bat_priv); \
- \
- return length; \
-}
-
-#define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \
-ssize_t batadv_show_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff) \
-{ \
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \
- \
- batadv_sysfs_deprecated(attr); \
- return sprintf(buff, "%i\n", atomic_read(&bat_priv->_var)); \
-} \
-
-/* Use this, if you are going to set [name] in the soft-interface
- * (bat_priv) to an unsigned integer value
- */
-#define BATADV_ATTR_SIF_UINT(_name, _var, _mode, _min, _max, _post_func)\
- static BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func)\
- static BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \
- static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
- batadv_store_##_name)
-
-#define BATADV_ATTR_VLAN_STORE_BOOL(_name, _post_func) \
-ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff, \
- size_t count) \
-{ \
- struct batadv_priv *bat_priv = batadv_vlan_kobj_to_batpriv(kobj);\
- struct batadv_softif_vlan *vlan = batadv_kobj_to_vlan(bat_priv, \
- kobj); \
- size_t res = __batadv_store_bool_attr(buff, count, _post_func, \
- attr, &vlan->_name, \
- bat_priv->soft_iface); \
- \
- batadv_sysfs_deprecated(attr); \
- if (vlan->vid) \
- batadv_netlink_notify_vlan(bat_priv, vlan); \
- else \
- batadv_netlink_notify_mesh(bat_priv); \
- \
- batadv_softif_vlan_put(vlan); \
- return res; \
-}
-
-#define BATADV_ATTR_VLAN_SHOW_BOOL(_name) \
-ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff) \
-{ \
- struct batadv_priv *bat_priv = batadv_vlan_kobj_to_batpriv(kobj);\
- struct batadv_softif_vlan *vlan = batadv_kobj_to_vlan(bat_priv, \
- kobj); \
- size_t res = sprintf(buff, "%s\n", \
- atomic_read(&vlan->_name) == 0 ? \
- "disabled" : "enabled"); \
- \
- batadv_sysfs_deprecated(attr); \
- batadv_softif_vlan_put(vlan); \
- return res; \
-}
-
-/* Use this, if you are going to turn a [name] in the vlan struct on or off */
-#define BATADV_ATTR_VLAN_BOOL(_name, _mode, _post_func) \
- static BATADV_ATTR_VLAN_STORE_BOOL(_name, _post_func) \
- static BATADV_ATTR_VLAN_SHOW_BOOL(_name) \
- static BATADV_ATTR_VLAN(_name, _mode, batadv_show_vlan_##_name, \
- batadv_store_vlan_##_name)
-
-#define BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, _max, _post_func) \
-ssize_t batadv_store_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff, \
- size_t count) \
-{ \
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
- struct batadv_hard_iface *hard_iface; \
- struct batadv_priv *bat_priv; \
- ssize_t length; \
- \
- batadv_sysfs_deprecated(attr); \
- hard_iface = batadv_hardif_get_by_netdev(net_dev); \
- if (!hard_iface) \
- return 0; \
- \
- length = __batadv_store_uint_attr(buff, count, _min, _max, \
- _post_func, attr, \
- &hard_iface->_var, \
- hard_iface->soft_iface, \
- net_dev); \
- \
- if (hard_iface->soft_iface) { \
- bat_priv = netdev_priv(hard_iface->soft_iface); \
- batadv_netlink_notify_hardif(bat_priv, hard_iface); \
- } \
- \
- batadv_hardif_put(hard_iface); \
- return length; \
-}
-
-#define BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \
-ssize_t batadv_show_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff) \
-{ \
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
- struct batadv_hard_iface *hard_iface; \
- ssize_t length; \
- \
- batadv_sysfs_deprecated(attr); \
- hard_iface = batadv_hardif_get_by_netdev(net_dev); \
- if (!hard_iface) \
- return 0; \
- \
- length = sprintf(buff, "%i\n", atomic_read(&hard_iface->_var)); \
- \
- batadv_hardif_put(hard_iface); \
- return length; \
-}
-
-/* Use this, if you are going to set [name] in hard_iface to an
- * unsigned integer value
- */
-#define BATADV_ATTR_HIF_UINT(_name, _var, _mode, _min, _max, _post_func)\
- static BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, \
- _max, _post_func) \
- static BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \
- static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
- batadv_store_##_name)
-
-static int batadv_store_bool_attr(char *buff, size_t count,
- struct net_device *net_dev,
- const char *attr_name, atomic_t *attr,
- bool *changed)
-{
- int enabled = -1;
-
- *changed = false;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- if ((strncmp(buff, "1", 2) == 0) ||
- (strncmp(buff, "enable", 7) == 0) ||
- (strncmp(buff, "enabled", 8) == 0))
- enabled = 1;
-
- if ((strncmp(buff, "0", 2) == 0) ||
- (strncmp(buff, "disable", 8) == 0) ||
- (strncmp(buff, "disabled", 9) == 0))
- enabled = 0;
-
- if (enabled < 0) {
- batadv_info(net_dev, "%s: Invalid parameter received: %s\n",
- attr_name, buff);
- return -EINVAL;
- }
-
- if (atomic_read(attr) == enabled)
- return count;
-
- batadv_info(net_dev, "%s: Changing from: %s to: %s\n", attr_name,
- atomic_read(attr) == 1 ? "enabled" : "disabled",
- enabled == 1 ? "enabled" : "disabled");
-
- *changed = true;
-
- atomic_set(attr, (unsigned int)enabled);
- return count;
-}
-
-static inline ssize_t
-__batadv_store_bool_attr(char *buff, size_t count,
- void (*post_func)(struct net_device *),
- struct attribute *attr,
- atomic_t *attr_store, struct net_device *net_dev)
-{
- bool changed;
- int ret;
-
- ret = batadv_store_bool_attr(buff, count, net_dev, attr->name,
- attr_store, &changed);
- if (post_func && changed)
- post_func(net_dev);
-
- return ret;
-}
-
-static int batadv_store_uint_attr(const char *buff, size_t count,
- struct net_device *net_dev,
- struct net_device *slave_dev,
- const char *attr_name,
- unsigned int min, unsigned int max,
- atomic_t *attr)
-{
- char ifname[IFNAMSIZ + 3] = "";
- unsigned long uint_val;
- int ret;
-
- ret = kstrtoul(buff, 10, &uint_val);
- if (ret) {
- batadv_info(net_dev, "%s: Invalid parameter received: %s\n",
- attr_name, buff);
- return -EINVAL;
- }
-
- if (uint_val < min) {
- batadv_info(net_dev, "%s: Value is too small: %lu min: %u\n",
- attr_name, uint_val, min);
- return -EINVAL;
- }
-
- if (uint_val > max) {
- batadv_info(net_dev, "%s: Value is too big: %lu max: %u\n",
- attr_name, uint_val, max);
- return -EINVAL;
- }
-
- if (atomic_read(attr) == uint_val)
- return count;
-
- if (slave_dev)
- snprintf(ifname, sizeof(ifname), "%s: ", slave_dev->name);
-
- batadv_info(net_dev, "%s: %sChanging from: %i to: %lu\n",
- attr_name, ifname, atomic_read(attr), uint_val);
-
- atomic_set(attr, uint_val);
- return count;
-}
-
-static ssize_t __batadv_store_uint_attr(const char *buff, size_t count,
- int min, int max,
- void (*post_func)(struct net_device *),
- const struct attribute *attr,
- atomic_t *attr_store,
- struct net_device *net_dev,
- struct net_device *slave_dev)
-{
- int ret;
-
- ret = batadv_store_uint_attr(buff, count, net_dev, slave_dev,
- attr->name, min, max, attr_store);
- if (post_func && ret)
- post_func(net_dev);
-
- return ret;
-}
-
-static ssize_t batadv_show_bat_algo(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
-
- batadv_sysfs_deprecated(attr);
- return sprintf(buff, "%s\n", bat_priv->algo_ops->name);
-}
-
-static void batadv_post_gw_reselect(struct net_device *net_dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
-
- batadv_gw_reselect(bat_priv);
-}
-
-static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr,
- char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
- int bytes_written;
-
- batadv_sysfs_deprecated(attr);
-
- /* GW mode is not available if the routing algorithm in use does not
- * implement the GW API
- */
- if (!bat_priv->algo_ops->gw.get_best_gw_node ||
- !bat_priv->algo_ops->gw.is_eligible)
- return -ENOENT;
-
- switch (atomic_read(&bat_priv->gw.mode)) {
- case BATADV_GW_MODE_CLIENT:
- bytes_written = sprintf(buff, "%s\n",
- BATADV_GW_MODE_CLIENT_NAME);
- break;
- case BATADV_GW_MODE_SERVER:
- bytes_written = sprintf(buff, "%s\n",
- BATADV_GW_MODE_SERVER_NAME);
- break;
- default:
- bytes_written = sprintf(buff, "%s\n",
- BATADV_GW_MODE_OFF_NAME);
- break;
- }
-
- return bytes_written;
-}
-
-static ssize_t batadv_store_gw_mode(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- char *curr_gw_mode_str;
- int gw_mode_tmp = -1;
-
- batadv_sysfs_deprecated(attr);
-
- /* toggling GW mode is allowed only if the routing algorithm in use
- * provides the GW API
- */
- if (!bat_priv->algo_ops->gw.get_best_gw_node ||
- !bat_priv->algo_ops->gw.is_eligible)
- return -EINVAL;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- if (strncmp(buff, BATADV_GW_MODE_OFF_NAME,
- strlen(BATADV_GW_MODE_OFF_NAME)) == 0)
- gw_mode_tmp = BATADV_GW_MODE_OFF;
-
- if (strncmp(buff, BATADV_GW_MODE_CLIENT_NAME,
- strlen(BATADV_GW_MODE_CLIENT_NAME)) == 0)
- gw_mode_tmp = BATADV_GW_MODE_CLIENT;
-
- if (strncmp(buff, BATADV_GW_MODE_SERVER_NAME,
- strlen(BATADV_GW_MODE_SERVER_NAME)) == 0)
- gw_mode_tmp = BATADV_GW_MODE_SERVER;
-
- if (gw_mode_tmp < 0) {
- batadv_info(net_dev,
- "Invalid parameter for 'gw mode' setting received: %s\n",
- buff);
- return -EINVAL;
- }
-
- if (atomic_read(&bat_priv->gw.mode) == gw_mode_tmp)
- return count;
-
- switch (atomic_read(&bat_priv->gw.mode)) {
- case BATADV_GW_MODE_CLIENT:
- curr_gw_mode_str = BATADV_GW_MODE_CLIENT_NAME;
- break;
- case BATADV_GW_MODE_SERVER:
- curr_gw_mode_str = BATADV_GW_MODE_SERVER_NAME;
- break;
- default:
- curr_gw_mode_str = BATADV_GW_MODE_OFF_NAME;
- break;
- }
-
- batadv_info(net_dev, "Changing gw mode from: %s to: %s\n",
- curr_gw_mode_str, buff);
-
- /* Invoking batadv_gw_reselect() is not enough to really de-select the
- * current GW. It will only instruct the gateway client code to perform
- * a re-election the next time that this is needed.
- *
- * When gw client mode is being switched off the current GW must be
- * de-selected explicitly otherwise no GW_ADD uevent is thrown on
- * client mode re-activation. This is operation is performed in
- * batadv_gw_check_client_stop().
- */
- batadv_gw_reselect(bat_priv);
- /* always call batadv_gw_check_client_stop() before changing the gateway
- * state
- */
- batadv_gw_check_client_stop(bat_priv);
- atomic_set(&bat_priv->gw.mode, (unsigned int)gw_mode_tmp);
- batadv_gw_tvlv_container_update(bat_priv);
-
- batadv_netlink_notify_mesh(bat_priv);
-
- return count;
-}
-
-static ssize_t batadv_show_gw_sel_class(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
-
- batadv_sysfs_deprecated(attr);
-
- /* GW selection class is not available if the routing algorithm in use
- * does not implement the GW API
- */
- if (!bat_priv->algo_ops->gw.get_best_gw_node ||
- !bat_priv->algo_ops->gw.is_eligible)
- return -ENOENT;
-
- if (bat_priv->algo_ops->gw.show_sel_class)
- return bat_priv->algo_ops->gw.show_sel_class(bat_priv, buff);
-
- return sprintf(buff, "%i\n", atomic_read(&bat_priv->gw.sel_class));
-}
-
-static ssize_t batadv_store_gw_sel_class(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
- ssize_t length;
-
- batadv_sysfs_deprecated(attr);
-
- /* setting the GW selection class is allowed only if the routing
- * algorithm in use implements the GW API
- */
- if (!bat_priv->algo_ops->gw.get_best_gw_node ||
- !bat_priv->algo_ops->gw.is_eligible)
- return -EINVAL;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- if (bat_priv->algo_ops->gw.store_sel_class)
- return bat_priv->algo_ops->gw.store_sel_class(bat_priv, buff,
- count);
-
- length = __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE,
- batadv_post_gw_reselect, attr,
- &bat_priv->gw.sel_class,
- bat_priv->soft_iface, NULL);
-
- batadv_netlink_notify_mesh(bat_priv);
-
- return length;
-}
-
-static ssize_t batadv_show_gw_bwidth(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
- u32 down, up;
-
- batadv_sysfs_deprecated(attr);
-
- down = atomic_read(&bat_priv->gw.bandwidth_down);
- up = atomic_read(&bat_priv->gw.bandwidth_up);
-
- return sprintf(buff, "%u.%u/%u.%u MBit\n", down / 10,
- down % 10, up / 10, up % 10);
-}
-
-static ssize_t batadv_store_gw_bwidth(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- ssize_t length;
-
- batadv_sysfs_deprecated(attr);
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- length = batadv_gw_bandwidth_set(net_dev, buff, count);
-
- batadv_netlink_notify_mesh(bat_priv);
-
- return length;
-}
-
-/**
- * batadv_show_isolation_mark() - print the current isolation mark/mask
- * @kobj: kobject representing the private mesh sysfs directory
- * @attr: the batman-adv attribute the user is interacting with
- * @buff: the buffer that will contain the data to send back to the user
- *
- * Return: the number of bytes written into 'buff' on success or a negative
- * error code in case of failure
- */
-static ssize_t batadv_show_isolation_mark(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
-
- batadv_sysfs_deprecated(attr);
- return sprintf(buff, "%#.8x/%#.8x\n", bat_priv->isolation_mark,
- bat_priv->isolation_mark_mask);
-}
-
-/**
- * batadv_store_isolation_mark() - parse and store the isolation mark/mask
- * entered by the user
- * @kobj: kobject representing the private mesh sysfs directory
- * @attr: the batman-adv attribute the user is interacting with
- * @buff: the buffer containing the user data
- * @count: number of bytes in the buffer
- *
- * Return: 'count' on success or a negative error code in case of failure
- */
-static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- u32 mark, mask;
- char *mask_ptr;
-
- batadv_sysfs_deprecated(attr);
-
- /* parse the mask if it has been specified, otherwise assume the mask is
- * the biggest possible
- */
- mask = 0xFFFFFFFF;
- mask_ptr = strchr(buff, '/');
- if (mask_ptr) {
- *mask_ptr = '\0';
- mask_ptr++;
-
- /* the mask must be entered in hex base as it is going to be a
- * bitmask and not a prefix length
- */
- if (kstrtou32(mask_ptr, 16, &mask) < 0)
- return -EINVAL;
- }
-
- /* the mark can be entered in any base */
- if (kstrtou32(buff, 0, &mark) < 0)
- return -EINVAL;
-
- bat_priv->isolation_mark_mask = mask;
- /* erase bits not covered by the mask */
- bat_priv->isolation_mark = mark & bat_priv->isolation_mark_mask;
-
- batadv_info(net_dev,
- "New skb mark for extended isolation: %#.8x/%#.8x\n",
- bat_priv->isolation_mark, bat_priv->isolation_mark_mask);
-
- batadv_netlink_notify_mesh(bat_priv);
-
- return count;
-}
-
-BATADV_ATTR_SIF_BOOL(aggregated_ogms, 0644, NULL);
-BATADV_ATTR_SIF_BOOL(bonding, 0644, NULL);
-#ifdef CONFIG_BATMAN_ADV_BLA
-BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, 0644, batadv_bla_status_update);
-#endif
-#ifdef CONFIG_BATMAN_ADV_DAT
-BATADV_ATTR_SIF_BOOL(distributed_arp_table, 0644, batadv_dat_status_update);
-#endif
-BATADV_ATTR_SIF_BOOL(fragmentation, 0644, batadv_update_min_mtu);
-static BATADV_ATTR(routing_algo, 0444, batadv_show_bat_algo, NULL);
-static BATADV_ATTR(gw_mode, 0644, batadv_show_gw_mode, batadv_store_gw_mode);
-BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, 0644, 2 * BATADV_JITTER,
- INT_MAX, NULL);
-BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, 0644, 0, BATADV_TQ_MAX_VALUE,
- NULL);
-static BATADV_ATTR(gw_sel_class, 0644, batadv_show_gw_sel_class,
- batadv_store_gw_sel_class);
-static BATADV_ATTR(gw_bandwidth, 0644, batadv_show_gw_bwidth,
- batadv_store_gw_bwidth);
-#ifdef CONFIG_BATMAN_ADV_MCAST
-BATADV_ATTR_SIF_BOOL(multicast_mode, 0644, NULL);
-#endif
-#ifdef CONFIG_BATMAN_ADV_DEBUG
-BATADV_ATTR_SIF_UINT(log_level, log_level, 0644, 0, BATADV_DBG_ALL, NULL);
-#endif
-#ifdef CONFIG_BATMAN_ADV_NC
-BATADV_ATTR_SIF_BOOL(network_coding, 0644, batadv_nc_status_update);
-#endif
-static BATADV_ATTR(isolation_mark, 0644, batadv_show_isolation_mark,
- batadv_store_isolation_mark);
-
-static struct batadv_attribute *batadv_mesh_attrs[] = {
- &batadv_attr_aggregated_ogms,
- &batadv_attr_bonding,
-#ifdef CONFIG_BATMAN_ADV_BLA
- &batadv_attr_bridge_loop_avoidance,
-#endif
-#ifdef CONFIG_BATMAN_ADV_DAT
- &batadv_attr_distributed_arp_table,
-#endif
-#ifdef CONFIG_BATMAN_ADV_MCAST
- &batadv_attr_multicast_mode,
-#endif
- &batadv_attr_fragmentation,
- &batadv_attr_routing_algo,
- &batadv_attr_gw_mode,
- &batadv_attr_orig_interval,
- &batadv_attr_hop_penalty,
- &batadv_attr_gw_sel_class,
- &batadv_attr_gw_bandwidth,
-#ifdef CONFIG_BATMAN_ADV_DEBUG
- &batadv_attr_log_level,
-#endif
-#ifdef CONFIG_BATMAN_ADV_NC
- &batadv_attr_network_coding,
-#endif
- &batadv_attr_isolation_mark,
- NULL,
-};
-
-BATADV_ATTR_VLAN_BOOL(ap_isolation, 0644, NULL);
-
-/* array of vlan specific sysfs attributes */
-static struct batadv_attribute *batadv_vlan_attrs[] = {
- &batadv_attr_vlan_ap_isolation,
- NULL,
-};
-
-/**
- * batadv_sysfs_add_meshif() - Add soft interface specific sysfs entries
- * @dev: netdev struct of the soft interface
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_sysfs_add_meshif(struct net_device *dev)
-{
- struct kobject *batif_kobject = &dev->dev.kobj;
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_attribute **bat_attr;
- int err;
-
- bat_priv->mesh_obj = kobject_create_and_add(BATADV_SYSFS_IF_MESH_SUBDIR,
- batif_kobject);
- if (!bat_priv->mesh_obj) {
- batadv_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name,
- BATADV_SYSFS_IF_MESH_SUBDIR);
- goto out;
- }
-
- for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) {
- err = sysfs_create_file(bat_priv->mesh_obj,
- &((*bat_attr)->attr));
- if (err) {
- batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n",
- dev->name, BATADV_SYSFS_IF_MESH_SUBDIR,
- ((*bat_attr)->attr).name);
- goto rem_attr;
- }
- }
-
- return 0;
-
-rem_attr:
- for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
-
- kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE);
- kobject_del(bat_priv->mesh_obj);
- kobject_put(bat_priv->mesh_obj);
- bat_priv->mesh_obj = NULL;
-out:
- return -ENOMEM;
-}
-
-/**
- * batadv_sysfs_del_meshif() - Remove soft interface specific sysfs entries
- * @dev: netdev struct of the soft interface
- */
-void batadv_sysfs_del_meshif(struct net_device *dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_attribute **bat_attr;
-
- for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
-
- kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE);
- kobject_del(bat_priv->mesh_obj);
- kobject_put(bat_priv->mesh_obj);
- bat_priv->mesh_obj = NULL;
-}
-
-/**
- * batadv_sysfs_add_vlan() - add all the needed sysfs objects for the new vlan
- * @dev: netdev of the mesh interface
- * @vlan: private data of the newly added VLAN interface
- *
- * Return: 0 on success and -ENOMEM if any of the structure allocations fails.
- */
-int batadv_sysfs_add_vlan(struct net_device *dev,
- struct batadv_softif_vlan *vlan)
-{
- char vlan_subdir[sizeof(BATADV_SYSFS_VLAN_SUBDIR_PREFIX) + 5];
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_attribute **bat_attr;
- int err;
-
- if (vlan->vid & BATADV_VLAN_HAS_TAG) {
- sprintf(vlan_subdir, BATADV_SYSFS_VLAN_SUBDIR_PREFIX "%hu",
- vlan->vid & VLAN_VID_MASK);
-
- vlan->kobj = kobject_create_and_add(vlan_subdir,
- bat_priv->mesh_obj);
- if (!vlan->kobj) {
- batadv_err(dev, "Can't add sysfs directory: %s/%s\n",
- dev->name, vlan_subdir);
- goto out;
- }
- } else {
- /* the untagged LAN uses the root folder to store its "VLAN
- * specific attributes"
- */
- vlan->kobj = bat_priv->mesh_obj;
- kobject_get(bat_priv->mesh_obj);
- }
-
- for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) {
- err = sysfs_create_file(vlan->kobj,
- &((*bat_attr)->attr));
- if (err) {
- batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n",
- dev->name, vlan_subdir,
- ((*bat_attr)->attr).name);
- goto rem_attr;
- }
- }
-
- return 0;
-
-rem_attr:
- for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
-
- if (vlan->kobj != bat_priv->mesh_obj) {
- kobject_uevent(vlan->kobj, KOBJ_REMOVE);
- kobject_del(vlan->kobj);
- }
- kobject_put(vlan->kobj);
- vlan->kobj = NULL;
-out:
- return -ENOMEM;
-}
-
-/**
- * batadv_sysfs_del_vlan() - remove all the sysfs objects for a given VLAN
- * @bat_priv: the bat priv with all the soft interface information
- * @vlan: the private data of the VLAN to destroy
- */
-void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv,
- struct batadv_softif_vlan *vlan)
-{
- struct batadv_attribute **bat_attr;
-
- for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
-
- if (vlan->kobj != bat_priv->mesh_obj) {
- kobject_uevent(vlan->kobj, KOBJ_REMOVE);
- kobject_del(vlan->kobj);
- }
- kobject_put(vlan->kobj);
- vlan->kobj = NULL;
-}
-
-static ssize_t batadv_show_mesh_iface(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_hard_iface *hard_iface;
- ssize_t length;
- const char *ifname;
-
- batadv_sysfs_deprecated(attr);
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return 0;
-
- if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
- ifname = "none";
- else
- ifname = hard_iface->soft_iface->name;
-
- length = sprintf(buff, "%s\n", ifname);
-
- batadv_hardif_put(hard_iface);
-
- return length;
-}
-
-/**
- * batadv_store_mesh_iface_finish() - store new hardif mesh_iface state
- * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
- * @ifname: name of soft-interface to modify
- *
- * Changes the parts of the hard+soft interface which can not be modified under
- * sysfs lock (to prevent deadlock situations).
- *
- * Return: 0 on success, < 0 on failure
- */
-static int batadv_store_mesh_iface_finish(struct net_device *net_dev,
- char ifname[IFNAMSIZ])
-{
- struct net *net = dev_net(net_dev);
- struct batadv_hard_iface *hard_iface;
- int status_tmp;
- int ret = 0;
-
- ASSERT_RTNL();
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return 0;
-
- if (strncmp(ifname, "none", 4) == 0)
- status_tmp = BATADV_IF_NOT_IN_USE;
- else
- status_tmp = BATADV_IF_I_WANT_YOU;
-
- if (hard_iface->if_status == status_tmp)
- goto out;
-
- if (hard_iface->soft_iface &&
- strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0)
- goto out;
-
- if (status_tmp == BATADV_IF_NOT_IN_USE) {
- batadv_hardif_disable_interface(hard_iface,
- BATADV_IF_CLEANUP_AUTO);
- goto out;
- }
-
- /* if the interface already is in use */
- if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
- batadv_hardif_disable_interface(hard_iface,
- BATADV_IF_CLEANUP_AUTO);
-
- ret = batadv_hardif_enable_interface(hard_iface, net, ifname);
-out:
- batadv_hardif_put(hard_iface);
- return ret;
-}
-
-/**
- * batadv_store_mesh_iface_work() - store new hardif mesh_iface state
- * @work: work queue item
- *
- * Changes the parts of the hard+soft interface which can not be modified under
- * sysfs lock (to prevent deadlock situations).
- */
-static void batadv_store_mesh_iface_work(struct work_struct *work)
-{
- struct batadv_store_mesh_work *store_work;
- int ret;
-
- store_work = container_of(work, struct batadv_store_mesh_work, work);
-
- rtnl_lock();
- ret = batadv_store_mesh_iface_finish(store_work->net_dev,
- store_work->soft_iface_name);
- rtnl_unlock();
-
- if (ret < 0)
- pr_err("Failed to store new mesh_iface state %s for %s: %d\n",
- store_work->soft_iface_name, store_work->net_dev->name,
- ret);
-
- dev_put(store_work->net_dev);
- kfree(store_work);
-}
-
-static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_store_mesh_work *store_work;
-
- batadv_sysfs_deprecated(attr);
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- if (strlen(buff) >= IFNAMSIZ) {
- pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
- buff);
- return -EINVAL;
- }
-
- store_work = kmalloc(sizeof(*store_work), GFP_KERNEL);
- if (!store_work)
- return -ENOMEM;
-
- dev_hold(net_dev);
- INIT_WORK(&store_work->work, batadv_store_mesh_iface_work);
- store_work->net_dev = net_dev;
- strscpy(store_work->soft_iface_name, buff,
- sizeof(store_work->soft_iface_name));
-
- queue_work(batadv_event_workqueue, &store_work->work);
-
- return count;
-}
-
-static ssize_t batadv_show_iface_status(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_hard_iface *hard_iface;
- ssize_t length;
-
- batadv_sysfs_deprecated(attr);
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return 0;
-
- switch (hard_iface->if_status) {
- case BATADV_IF_TO_BE_REMOVED:
- length = sprintf(buff, "disabling\n");
- break;
- case BATADV_IF_INACTIVE:
- length = sprintf(buff, "inactive\n");
- break;
- case BATADV_IF_ACTIVE:
- length = sprintf(buff, "active\n");
- break;
- case BATADV_IF_TO_BE_ACTIVATED:
- length = sprintf(buff, "enabling\n");
- break;
- case BATADV_IF_NOT_IN_USE:
- default:
- length = sprintf(buff, "not in use\n");
- break;
- }
-
- batadv_hardif_put(hard_iface);
-
- return length;
-}
-
-#ifdef CONFIG_BATMAN_ADV_BATMAN_V
-
-/**
- * batadv_store_throughput_override() - parse and store throughput override
- * entered by the user
- * @kobj: kobject representing the private mesh sysfs directory
- * @attr: the batman-adv attribute the user is interacting with
- * @buff: the buffer containing the user data
- * @count: number of bytes in the buffer
- *
- * Return: 'count' on success or a negative error code in case of failure
- */
-static ssize_t batadv_store_throughput_override(struct kobject *kobj,
- struct attribute *attr,
- char *buff, size_t count)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_hard_iface *hard_iface;
- struct batadv_priv *bat_priv;
- u32 tp_override;
- u32 old_tp_override;
- bool ret;
-
- batadv_sysfs_deprecated(attr);
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return -EINVAL;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- ret = batadv_parse_throughput(net_dev, buff, "throughput_override",
- &tp_override);
- if (!ret)
- goto out;
-
- old_tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
- if (old_tp_override == tp_override)
- goto out;
-
- batadv_info(hard_iface->soft_iface,
- "%s: %s: Changing from: %u.%u MBit to: %u.%u MBit\n",
- "throughput_override", net_dev->name,
- old_tp_override / 10, old_tp_override % 10,
- tp_override / 10, tp_override % 10);
-
- atomic_set(&hard_iface->bat_v.throughput_override, tp_override);
-
- if (hard_iface->soft_iface) {
- bat_priv = netdev_priv(hard_iface->soft_iface);
- batadv_netlink_notify_hardif(bat_priv, hard_iface);
- }
-
-out:
- batadv_hardif_put(hard_iface);
- return count;
-}
-
-static ssize_t batadv_show_throughput_override(struct kobject *kobj,
- struct attribute *attr,
- char *buff)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_hard_iface *hard_iface;
- u32 tp_override;
-
- batadv_sysfs_deprecated(attr);
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return -EINVAL;
-
- tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
-
- batadv_hardif_put(hard_iface);
- return sprintf(buff, "%u.%u MBit\n", tp_override / 10,
- tp_override % 10);
-}
-
-#endif
-
-static BATADV_ATTR(mesh_iface, 0644, batadv_show_mesh_iface,
- batadv_store_mesh_iface);
-static BATADV_ATTR(iface_status, 0444, batadv_show_iface_status, NULL);
-#ifdef CONFIG_BATMAN_ADV_BATMAN_V
-BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, 0644,
- 2 * BATADV_JITTER, INT_MAX, NULL);
-static BATADV_ATTR(throughput_override, 0644, batadv_show_throughput_override,
- batadv_store_throughput_override);
-#endif
-
-static struct batadv_attribute *batadv_batman_attrs[] = {
- &batadv_attr_mesh_iface,
- &batadv_attr_iface_status,
-#ifdef CONFIG_BATMAN_ADV_BATMAN_V
- &batadv_attr_elp_interval,
- &batadv_attr_throughput_override,
-#endif
- NULL,
-};
-
-/**
- * batadv_sysfs_add_hardif() - Add hard interface specific sysfs entries
- * @hardif_obj: address where to store the pointer to new sysfs folder
- * @dev: netdev struct of the hard interface
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev)
-{
- struct kobject *hardif_kobject = &dev->dev.kobj;
- struct batadv_attribute **bat_attr;
- int err;
-
- *hardif_obj = kobject_create_and_add(BATADV_SYSFS_IF_BAT_SUBDIR,
- hardif_kobject);
-
- if (!*hardif_obj) {
- batadv_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name,
- BATADV_SYSFS_IF_BAT_SUBDIR);
- goto out;
- }
-
- for (bat_attr = batadv_batman_attrs; *bat_attr; ++bat_attr) {
- err = sysfs_create_file(*hardif_obj, &((*bat_attr)->attr));
- if (err) {
- batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n",
- dev->name, BATADV_SYSFS_IF_BAT_SUBDIR,
- ((*bat_attr)->attr).name);
- goto rem_attr;
- }
- }
-
- return 0;
-
-rem_attr:
- for (bat_attr = batadv_batman_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(*hardif_obj, &((*bat_attr)->attr));
-out:
- return -ENOMEM;
-}
-
-/**
- * batadv_sysfs_del_hardif() - Remove hard interface specific sysfs entries
- * @hardif_obj: address of the pointer that stores the batman-adv sysfs folder
- * of the hard interface
- */
-void batadv_sysfs_del_hardif(struct kobject **hardif_obj)
-{
- kobject_uevent(*hardif_obj, KOBJ_REMOVE);
- kobject_del(*hardif_obj);
- kobject_put(*hardif_obj);
- *hardif_obj = NULL;
-}
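As background for the isolation_mark handler removed above: it split the input on '/', parsed the mask as hex, accepted any base for the mark, and clipped the mark to the mask. A minimal user-space sketch of that parsing (illustrative only; strtoul() stands in for kstrtou32() and does no error checking):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse "mark[/mask]": the mask defaults to 0xffffffff and must be hex
 * when present; the mark may use any base and is clipped to the mask.
 */
static void parse_isolation_mark(char *buf, uint32_t *mark, uint32_t *mask)
{
	char *mask_ptr = strchr(buf, '/');

	*mask = 0xffffffff;
	if (mask_ptr) {
		*mask_ptr++ = '\0';
		*mask = (uint32_t)strtoul(mask_ptr, NULL, 16);
	}

	*mark = (uint32_t)strtoul(buf, NULL, 0) & *mask;
}

int main(void)
{
	char input[] = "0x40/0xff";
	uint32_t mark, mask;

	parse_isolation_mark(input, &mark, &mask);
	printf("%#.8x/%#.8x\n", mark, mask); /* prints 0x00000040/0x000000ff */
	return 0;
}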
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
deleted file mode 100644
index d987f8b30a98..000000000000
--- a/net/batman-adv/sysfs.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- */
-
-#ifndef _NET_BATMAN_ADV_SYSFS_H_
-#define _NET_BATMAN_ADV_SYSFS_H_
-
-#include "main.h"
-
-#include <linux/kobject.h>
-#include <linux/netdevice.h>
-#include <linux/sysfs.h>
-#include <linux/types.h>
-
-#define BATADV_SYSFS_IF_MESH_SUBDIR "mesh"
-#define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv"
-/**
- * BATADV_SYSFS_VLAN_SUBDIR_PREFIX - prefix of the subfolder that will be
- * created in the sysfs hierarchy for each VLAN interface. The subfolder will
- * be named "BATADV_SYSFS_VLAN_SUBDIR_PREFIX%vid".
- */
-#define BATADV_SYSFS_VLAN_SUBDIR_PREFIX "vlan"
-
-/**
- * struct batadv_attribute - sysfs export helper for batman-adv attributes
- */
-struct batadv_attribute {
- /** @attr: sysfs attribute file */
- struct attribute attr;
-
- /**
- * @show: function to export the current attribute's content to sysfs
- */
- ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
- char *buf);
-
- /**
- * @store: function to load new value from character buffer and save it
- * in batman-adv attribute
- */
- ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
- char *buf, size_t count);
-};
-
-#ifdef CONFIG_BATMAN_ADV_SYSFS
-
-int batadv_sysfs_add_meshif(struct net_device *dev);
-void batadv_sysfs_del_meshif(struct net_device *dev);
-int batadv_sysfs_add_hardif(struct kobject **hardif_obj,
- struct net_device *dev);
-void batadv_sysfs_del_hardif(struct kobject **hardif_obj);
-int batadv_sysfs_add_vlan(struct net_device *dev,
- struct batadv_softif_vlan *vlan);
-void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv,
- struct batadv_softif_vlan *vlan);
-
-#else
-
-static inline int batadv_sysfs_add_meshif(struct net_device *dev)
-{
- return 0;
-}
-
-static inline void batadv_sysfs_del_meshif(struct net_device *dev)
-{
-}
-
-static inline int batadv_sysfs_add_hardif(struct kobject **hardif_obj,
- struct net_device *dev)
-{
- return 0;
-}
-
-static inline void batadv_sysfs_del_hardif(struct kobject **hardif_obj)
-{
-}
-
-static inline int batadv_sysfs_add_vlan(struct net_device *dev,
- struct batadv_softif_vlan *vlan)
-{
- return 0;
-}
-
-static inline void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv,
- struct batadv_softif_vlan *vlan)
-{
-}
-
-#endif
-
-#endif /* _NET_BATMAN_ADV_SYSFS_H_ */
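The deleted header above used the common kernel pattern for optional features: real prototypes when CONFIG_BATMAN_ADV_SYSFS was set, empty static inline stubs otherwise, so callers compile unchanged either way. A generic sketch of that pattern with hypothetical names (not part of this patch):

/* feature.h - illustrative sketch; CONFIG_FEATURE_X and these symbols
 * are hypothetical and only mirror the stub pattern seen above.
 */
#ifndef FEATURE_H
#define FEATURE_H

struct net_device;

#ifdef CONFIG_FEATURE_X

int feature_x_register(struct net_device *dev);
void feature_x_unregister(struct net_device *dev);

#else

/* Compiled-out variant: succeed silently so callers need no #ifdefs. */
static inline int feature_x_register(struct net_device *dev)
{
	return 0;
}

static inline void feature_x_unregister(struct net_device *dev)
{
}

#endif /* CONFIG_FEATURE_X */

#endif /* FEATURE_H */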
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index db7e3774825b..d4e10005df6c 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -23,6 +23,7 @@
#include <linux/kthread.h>
#include <linux/limits.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/param.h>
#include <linux/printk.h>
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 98a0aaaf0d50..cd09916f97fe 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -30,7 +30,6 @@
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -1062,84 +1061,6 @@ container_register:
kfree(tt_data);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-/**
- * batadv_tt_local_seq_print_text() - Print the local tt table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->tt.local_hash;
- struct batadv_tt_common_entry *tt_common_entry;
- struct batadv_tt_local_entry *tt_local;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- u32 i;
- int last_seen_secs;
- int last_seen_msecs;
- unsigned long last_seen_jiffies;
- bool no_purge;
- u16 np_flag = BATADV_TT_CLIENT_NOPURGE;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- seq_printf(seq,
- "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n",
- net_dev->name, (u8)atomic_read(&bat_priv->tt.vn));
- seq_puts(seq,
- " Client VID Flags Last seen (CRC )\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(tt_common_entry,
- head, hash_entry) {
- tt_local = container_of(tt_common_entry,
- struct batadv_tt_local_entry,
- common);
- last_seen_jiffies = jiffies - tt_local->last_seen;
- last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
- last_seen_secs = last_seen_msecs / 1000;
- last_seen_msecs = last_seen_msecs % 1000;
-
- no_purge = tt_common_entry->flags & np_flag;
- seq_printf(seq,
- " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n",
- tt_common_entry->addr,
- batadv_print_vid(tt_common_entry->vid),
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_ROAM) ? 'R' : '.'),
- no_purge ? 'P' : '.',
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_NEW) ? 'N' : '.'),
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_PENDING) ? 'X' : '.'),
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_WIFI) ? 'W' : '.'),
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
- no_purge ? 0 : last_seen_secs,
- no_purge ? 0 : last_seen_msecs,
- tt_local->vlan->tt.crc);
- }
- rcu_read_unlock();
- }
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
* batadv_tt_local_dump_entry() - Dump one TT local entry into a message
 * @msg: Netlink message to dump into
@@ -1879,139 +1800,6 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv,
return best_entry;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_tt_global_print_entry() - print all orig nodes who announce the
- * address for this global entry
- * @bat_priv: the bat priv with all the soft interface information
- * @tt_global_entry: global translation table entry to be printed
- * @seq: debugfs table seq_file struct
- *
- * This function assumes the caller holds rcu_read_lock().
- */
-static void
-batadv_tt_global_print_entry(struct batadv_priv *bat_priv,
- struct batadv_tt_global_entry *tt_global_entry,
- struct seq_file *seq)
-{
- struct batadv_tt_orig_list_entry *orig_entry, *best_entry;
- struct batadv_tt_common_entry *tt_common_entry;
- struct batadv_orig_node_vlan *vlan;
- struct hlist_head *head;
- u8 last_ttvn;
- u16 flags;
-
- tt_common_entry = &tt_global_entry->common;
- flags = tt_common_entry->flags;
-
- best_entry = batadv_transtable_best_orig(bat_priv, tt_global_entry);
- if (best_entry) {
- vlan = batadv_orig_node_vlan_get(best_entry->orig_node,
- tt_common_entry->vid);
- if (!vlan) {
- seq_printf(seq,
- " * Cannot retrieve VLAN %d for originator %pM\n",
- batadv_print_vid(tt_common_entry->vid),
- best_entry->orig_node->orig);
- goto print_list;
- }
-
- last_ttvn = atomic_read(&best_entry->orig_node->last_ttvn);
- seq_printf(seq,
- " %c %pM %4i (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n",
- '*', tt_global_entry->common.addr,
- batadv_print_vid(tt_global_entry->common.vid),
- best_entry->ttvn, best_entry->orig_node->orig,
- last_ttvn, vlan->tt.crc,
- ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'),
- ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'),
- ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
- ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
-
- batadv_orig_node_vlan_put(vlan);
- }
-
-print_list:
- head = &tt_global_entry->orig_list;
-
- hlist_for_each_entry_rcu(orig_entry, head, list) {
- if (best_entry == orig_entry)
- continue;
-
- vlan = batadv_orig_node_vlan_get(orig_entry->orig_node,
- tt_common_entry->vid);
- if (!vlan) {
- seq_printf(seq,
- " + Cannot retrieve VLAN %d for originator %pM\n",
- batadv_print_vid(tt_common_entry->vid),
- orig_entry->orig_node->orig);
- continue;
- }
-
- last_ttvn = atomic_read(&orig_entry->orig_node->last_ttvn);
- seq_printf(seq,
- " %c %pM %4d (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n",
- '+', tt_global_entry->common.addr,
- batadv_print_vid(tt_global_entry->common.vid),
- orig_entry->ttvn, orig_entry->orig_node->orig,
- last_ttvn, vlan->tt.crc,
- ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'),
- ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'),
- ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
- ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
-
- batadv_orig_node_vlan_put(vlan);
- }
-}
-
-/**
- * batadv_tt_global_seq_print_text() - Print the global tt table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->tt.global_hash;
- struct batadv_tt_common_entry *tt_common_entry;
- struct batadv_tt_global_entry *tt_global;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- u32 i;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- seq_printf(seq,
- "Globally announced TT entries received via the mesh %s\n",
- net_dev->name);
- seq_puts(seq,
- " Client VID (TTVN) Originator (Curr TTVN) (CRC ) Flags\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(tt_common_entry,
- head, hash_entry) {
- tt_global = container_of(tt_common_entry,
- struct batadv_tt_global_entry,
- common);
- batadv_tt_global_print_entry(bat_priv, tt_global, seq);
- }
- rcu_read_unlock();
- }
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
 * batadv_tt_global_dump_subentry() - Dump one TT global subentry into a message
* @msg: Netlink message to dump into
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index b24d35b9226a..57192c817229 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -11,7 +11,6 @@
#include <linux/netdevice.h>
#include <linux/netlink.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/types.h>
@@ -21,8 +20,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
u16 batadv_tt_local_remove(struct batadv_priv *bat_priv,
const u8 *addr, unsigned short vid,
const char *message, bool roaming);
-int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset);
-int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset);
int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb);
int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb);
void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index ed519efa3c36..2f96e96a5ca4 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -21,7 +21,6 @@
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/sched.h> /* for linux/wait.h */
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
@@ -187,9 +186,6 @@ struct batadv_hard_iface {
/** @net_dev: pointer to the net_device */
struct net_device *net_dev;
- /** @hardif_obj: kobject of the per interface sysfs "mesh" directory */
- struct kobject *hardif_obj;
-
/** @refcount: number of contexts the object is used */
struct kref refcount;
@@ -222,13 +218,6 @@ struct batadv_hard_iface {
struct batadv_hard_iface_bat_v bat_v;
#endif
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /**
- * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
- */
- struct dentry *debug_dir;
-#endif
-
/**
* @neigh_list: list of unique single hop neighbors via this interface
*/
@@ -1306,13 +1295,6 @@ struct batadv_priv_nc {
/** @work: work queue callback item for cleanup */
struct delayed_work work;
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /**
- * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
- */
- struct dentry *debug_dir;
-#endif
-
/**
* @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
*/
@@ -1492,7 +1474,7 @@ struct batadv_tp_vars {
/** @unacked_lock: protect unacked_list */
spinlock_t unacked_lock;
- /** @last_recv_time: time time (jiffies) a msg was received */
+ /** @last_recv_time: time (jiffies) a msg was received */
unsigned long last_recv_time;
/** @refcount: number of context where the object is used */
@@ -1512,9 +1494,6 @@ struct batadv_softif_vlan {
/** @vid: VLAN identifier */
unsigned short vid;
- /** @kobj: kobject for sysfs vlan subdirectory */
- struct kobject *kobj;
-
/** @ap_isolation: AP isolation state */
atomic_t ap_isolation; /* boolean */
@@ -1667,14 +1646,6 @@ struct batadv_priv {
/** @batman_queue_left: number of remaining OGM packet slots */
atomic_t batman_queue_left;
- /** @mesh_obj: kobject for sysfs mesh subdirectory */
- struct kobject *mesh_obj;
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /** @debug_dir: dentry for debugfs batman-adv subdirectory */
- struct dentry *debug_dir;
-#endif
-
/** @forw_bat_list: list of aggregated OGMs that will be forwarded */
struct hlist_head forw_bat_list;
@@ -1996,7 +1967,7 @@ struct batadv_tt_change_node {
*/
struct batadv_tt_req_node {
/**
- * @addr: mac address address of the originator this request was sent to
+ * @addr: mac address of the originator this request was sent to
*/
u8 addr[ETH_ALEN];
@@ -2234,11 +2205,6 @@ struct batadv_algo_neigh_ops {
struct batadv_neigh_node *neigh2,
struct batadv_hard_iface *if_outgoing2);
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /** @print: print the single hop neighbor list (optional) */
- void (*print)(struct batadv_priv *priv, struct seq_file *seq);
-#endif
-
/** @dump: dump neighbors to a netlink socket (optional) */
void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_priv *priv,
@@ -2249,12 +2215,6 @@ struct batadv_algo_neigh_ops {
* struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific)
*/
struct batadv_algo_orig_ops {
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /** @print: print the originator table (optional) */
- void (*print)(struct batadv_priv *priv, struct seq_file *seq,
- struct batadv_hard_iface *hard_iface);
-#endif
-
/** @dump: dump originators to a netlink socket (optional) */
void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_priv *priv,
@@ -2274,10 +2234,6 @@ struct batadv_algo_gw_ops {
*/
ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff,
size_t count);
-
- /** @show_sel_class: prints the current GW selection class (optional) */
- ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff);
-
/**
* @get_best_gw_node: select the best GW from the list of available
* nodes (optional)
@@ -2293,11 +2249,6 @@ struct batadv_algo_gw_ops {
struct batadv_orig_node *curr_gw_orig,
struct batadv_orig_node *orig_node);
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /** @print: print the gateway table (optional) */
- void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq);
-#endif
-
/** @dump: dump gateways to a netlink socket (optional) */
void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_priv *priv);
@@ -2456,21 +2407,4 @@ enum batadv_tvlv_handler_flags {
BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2),
};
-/**
- * struct batadv_store_mesh_work - Work queue item to detach add/del interface
- * from sysfs locks
- */
-struct batadv_store_mesh_work {
- /**
- * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
- */
- struct net_device *net_dev;
-
- /** @soft_iface_name: name of soft-interface to modify */
- char soft_iface_name[IFNAMSIZ];
-
- /** @work: work queue item */
- struct work_struct work;
-};
-
#endif /* _NET_BATMAN_ADV_TYPES_H_ */
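The batadv_store_mesh_work structure dropped above existed only to defer the interface attach/detach from the sysfs store handler to a workqueue, so rtnl_lock() could be taken without deadlocking against the sysfs lock. A hedged sketch of that defer-to-workqueue pattern with made-up names (not the removed code itself):

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/workqueue.h>

/* Illustrative only: carry the request data into process context. */
struct defer_request {
	struct work_struct work;
	char name[16];
};

static void defer_request_fn(struct work_struct *work)
{
	struct defer_request *req = container_of(work, struct defer_request,
						 work);

	/* Lock-ordering-sensitive work goes here, e.g. under rtnl_lock(). */
	kfree(req);
}

static int defer_request_queue(const char *name)
{
	struct defer_request *req = kmalloc(sizeof(*req), GFP_KERNEL);

	if (!req)
		return -ENOMEM;

	INIT_WORK(&req->work, defer_request_fn);
	strscpy(req->name, name, sizeof(req->name));
	schedule_work(&req->work);

	return 0;
}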
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index e2497d764e97..64e669acd42f 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -64,7 +64,6 @@ source "net/bluetooth/hidp/Kconfig"
config BT_HS
bool "Bluetooth High Speed (HS) features"
depends on BT_BREDR
- default y
help
Bluetooth High Speed includes support for off-loading
Bluetooth connections via 802.11 (wifi) physical layer
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 26526be579c7..da7fd7c8c2dc 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -226,6 +226,9 @@ static int a2mp_discover_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
struct a2mp_info_req req;
found = true;
+
+ memset(&req, 0, sizeof(req));
+
req.id = cl->id;
a2mp_send(mgr, A2MP_GETINFO_REQ, __next_ident(mgr),
sizeof(req), &req);
@@ -305,6 +308,8 @@ static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb,
if (!hdev || hdev->dev_type != HCI_AMP) {
struct a2mp_info_rsp rsp;
+ memset(&rsp, 0, sizeof(rsp));
+
rsp.id = req->id;
rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
@@ -348,6 +353,8 @@ static int a2mp_getinfo_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
if (!ctrl)
return -ENOMEM;
+ memset(&req, 0, sizeof(req));
+
req.id = rsp->id;
a2mp_send(mgr, A2MP_GETAMPASSOC_REQ, __next_ident(mgr), sizeof(req),
&req);
@@ -376,6 +383,8 @@ static int a2mp_getampassoc_req(struct amp_mgr *mgr, struct sk_buff *skb,
struct a2mp_amp_assoc_rsp rsp;
+	memset(&rsp, 0, sizeof(rsp));
+
 	rsp.id = req->id;
if (tmp) {
rsp.status = A2MP_STATUS_COLLISION_OCCURED;
amp_mgr_put(tmp);
@@ -464,7 +473,6 @@ static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb,
struct a2mp_cmd *hdr)
{
struct a2mp_physlink_req *req = (void *) skb->data;
-
struct a2mp_physlink_rsp rsp;
struct hci_dev *hdev;
struct hci_conn *hcon;
@@ -475,6 +483,8 @@ static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb,
BT_DBG("local_id %d, remote_id %d", req->local_id, req->remote_id);
+ memset(&rsp, 0, sizeof(rsp));
+
rsp.local_id = req->remote_id;
rsp.remote_id = req->local_id;
@@ -553,6 +563,8 @@ static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb,
BT_DBG("local_id %d remote_id %d", req->local_id, req->remote_id);
+ memset(&rsp, 0, sizeof(rsp));
+
rsp.local_id = req->remote_id;
rsp.remote_id = req->local_id;
rsp.status = A2MP_STATUS_SUCCESS;
@@ -675,6 +687,8 @@ static int a2mp_chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
if (err) {
struct a2mp_cmd_rej rej;
+ memset(&rej, 0, sizeof(rej));
+
rej.reason = cpu_to_le16(0);
hdr = (void *) skb->data;
@@ -898,6 +912,8 @@ void a2mp_send_getinfo_rsp(struct hci_dev *hdev)
BT_DBG("%s mgr %p", hdev->name, mgr);
+ memset(&rsp, 0, sizeof(rsp));
+
rsp.id = hdev->id;
rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
@@ -995,6 +1011,8 @@ void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status)
if (!mgr)
return;
+ memset(&rsp, 0, sizeof(rsp));
+
hs_hcon = hci_conn_hash_lookup_state(hdev, AMP_LINK, BT_CONNECT);
if (!hs_hcon) {
rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
@@ -1027,6 +1045,8 @@ void a2mp_discover_amp(struct l2cap_chan *chan)
mgr->bredr_chan = chan;
+ memset(&req, 0, sizeof(req));
+
req.mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU);
req.ext_feat = 0;
a2mp_send(mgr, A2MP_DISCOVER_REQ, 1, sizeof(req), &req);
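All of the memset() additions above share one purpose: the response structures live on the stack, and any field or padding left unwritten would leak stack memory once the struct is copied into an skb and sent. A minimal sketch of the rule being enforced (struct and helper are hypothetical):

#include <stdint.h>
#include <string.h>

struct demo_rsp {
	uint8_t id;
	uint8_t status;
	uint16_t reserved; /* holes/padding would otherwise expose stale stack bytes */
};

/* Zero the whole struct first, then fill only the fields that matter;
 * whatever copies it verbatim onto the wire now sees no uninitialized data.
 */
static void build_rsp(struct demo_rsp *rsp, uint8_t id, uint8_t status)
{
	memset(rsp, 0, sizeof(*rsp));
	rsp->id = id;
	rsp->status = status;
}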
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 9832f8445d43..4f1cd8063e72 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -758,6 +758,9 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
conn = hci_lookup_le_connect(hdev);
+ if (hdev->adv_instance_cnt)
+ hci_req_resume_adv_instances(hdev);
+
if (!status) {
hci_connect_le_scan_cleanup(conn);
goto done;
@@ -1067,10 +1070,11 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
* connections most controllers will refuse to connect if
* advertising is enabled, and for slave role connections we
* anyway have to disable it in order to start directed
- * advertising.
+ * advertising. Any registered advertisements will be
+ * re-enabled after the connection attempt is finished.
*/
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
- __hci_req_disable_advertising(&req);
+ __hci_req_pause_adv_instances(&req);
/* If requested to connect as slave use directed advertising */
if (conn->role == HCI_ROLE_SLAVE) {
@@ -1118,6 +1122,10 @@ create_conn:
err = hci_req_run(&req, create_le_conn_complete);
if (err) {
hci_conn_del(conn);
+
+ if (hdev->adv_instance_cnt)
+ hci_req_resume_adv_instances(hdev);
+
return ERR_PTR(err);
}
@@ -1388,7 +1396,7 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
return 0;
}
-/* Encrypt the the link */
+/* Encrypt the link */
static void hci_conn_encrypt(struct hci_conn *conn)
{
BT_DBG("hcon %p", conn);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 68bfe57b6625..9d2c9a1c552f 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -741,6 +741,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL);
}
+ if (hdev->commands[38] & 0x80) {
+		/* Read LE Min/Max Tx Power */
+ hci_req_add(req, HCI_OP_LE_READ_TRANSMIT_POWER,
+ 0, NULL);
+ }
+
if (hdev->commands[26] & 0x40) {
/* Read LE White List Size */
hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE,
@@ -763,7 +769,7 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL);
}
- if (hdev->commands[35] & 0x40) {
+ if (hdev->commands[35] & 0x04) {
__le16 rpa_timeout = cpu_to_le16(hdev->rpa_timeout);
/* Set RPA timeout */
@@ -808,7 +814,7 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt)
* Delete Stored Link Key command. They are clearly indicating its
* absence in the bit mask of supported commands.
*
- * Check the supported commands and only if the the command is marked
+ * Check the supported commands and only if the command is marked
* as supported send it. If not supported assume that the controller
* does not have actual support for stored link keys which makes this
* command redundant anyway.
@@ -2951,7 +2957,8 @@ static void adv_instance_rpa_expired(struct work_struct *work)
int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
u16 adv_data_len, u8 *adv_data,
u16 scan_rsp_len, u8 *scan_rsp_data,
- u16 timeout, u16 duration)
+ u16 timeout, u16 duration, s8 tx_power,
+ u32 min_interval, u32 max_interval)
{
struct adv_info *adv_instance;
@@ -2963,7 +2970,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
sizeof(adv_instance->scan_rsp_data));
} else {
if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets ||
- instance < 1 || instance > HCI_MAX_ADV_INSTANCES)
+ instance < 1 || instance > hdev->le_num_of_adv_sets)
return -EOVERFLOW;
adv_instance = kzalloc(sizeof(*adv_instance), GFP_KERNEL);
@@ -2979,6 +2986,9 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
adv_instance->flags = flags;
adv_instance->adv_data_len = adv_data_len;
adv_instance->scan_rsp_len = scan_rsp_len;
+ adv_instance->min_interval = min_interval;
+ adv_instance->max_interval = max_interval;
+ adv_instance->tx_power = tx_power;
if (adv_data_len)
memcpy(adv_instance->adv_data, adv_data, adv_data_len);
@@ -2995,8 +3005,6 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
else
adv_instance->duration = duration;
- adv_instance->tx_power = HCI_TX_POWER_INVALID;
-
INIT_DELAYED_WORK(&adv_instance->rpa_expired_cb,
adv_instance_rpa_expired);
@@ -3006,6 +3014,37 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
}
/* This function requires the caller holds hdev->lock */
+int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance,
+ u16 adv_data_len, u8 *adv_data,
+ u16 scan_rsp_len, u8 *scan_rsp_data)
+{
+ struct adv_info *adv_instance;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+
+ /* If advertisement doesn't exist, we can't modify its data */
+ if (!adv_instance)
+ return -ENOENT;
+
+ if (adv_data_len) {
+ memset(adv_instance->adv_data, 0,
+ sizeof(adv_instance->adv_data));
+ memcpy(adv_instance->adv_data, adv_data, adv_data_len);
+ adv_instance->adv_data_len = adv_data_len;
+ }
+
+ if (scan_rsp_len) {
+ memset(adv_instance->scan_rsp_data, 0,
+ sizeof(adv_instance->scan_rsp_data));
+ memcpy(adv_instance->scan_rsp_data,
+ scan_rsp_data, scan_rsp_len);
+ adv_instance->scan_rsp_len = scan_rsp_len;
+ }
+
+ return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
void hci_adv_monitors_clear(struct hci_dev *hdev)
{
struct adv_monitor *monitor;
@@ -3061,6 +3100,7 @@ static int free_adv_monitor(int id, void *ptr, void *data)
idr_remove(&hdev->adv_monitors_idr, monitor->handle);
hci_free_adv_monitor(monitor);
+ hdev->adv_monitors_cnt--;
return 0;
}
@@ -3077,6 +3117,7 @@ int hci_remove_adv_monitor(struct hci_dev *hdev, u16 handle)
idr_remove(&hdev->adv_monitors_idr, monitor->handle);
hci_free_adv_monitor(monitor);
+ hdev->adv_monitors_cnt--;
} else {
/* Remove all monitors if handle is 0. */
idr_for_each(&hdev->adv_monitors_idr, &free_adv_monitor, hdev);
@@ -3442,6 +3483,16 @@ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr,
}
}
+static void hci_suspend_clear_tasks(struct hci_dev *hdev)
+{
+ int i;
+
+ for (i = 0; i < __SUSPEND_NUM_TASKS; i++)
+ clear_bit(i, hdev->suspend_tasks);
+
+ wake_up(&hdev->suspend_wait_q);
+}
+
static int hci_suspend_wait_event(struct hci_dev *hdev)
{
#define WAKE_COND \
@@ -3487,12 +3538,24 @@ static int hci_change_suspend_state(struct hci_dev *hdev,
return hci_suspend_wait_event(hdev);
}
+static void hci_clear_wake_reason(struct hci_dev *hdev)
+{
+ hci_dev_lock(hdev);
+
+ hdev->wake_reason = 0;
+ bacpy(&hdev->wake_addr, BDADDR_ANY);
+ hdev->wake_addr_type = 0;
+
+ hci_dev_unlock(hdev);
+}
+
static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action,
void *data)
{
struct hci_dev *hdev =
container_of(nb, struct hci_dev, suspend_notifier);
int ret = 0;
+ u8 state = BT_RUNNING;
/* If powering down, wait for completion. */
if (mgmt_powering_down(hdev)) {
@@ -3513,15 +3576,27 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action,
* - Second, program event filter/whitelist and enable scan
*/
ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT);
+ if (!ret)
+ state = BT_SUSPEND_DISCONNECT;
/* Only configure whitelist if disconnect succeeded and wake
* isn't being prevented.
*/
- if (!ret && !(hdev->prevent_wake && hdev->prevent_wake(hdev)))
+ if (!ret && !(hdev->prevent_wake && hdev->prevent_wake(hdev))) {
ret = hci_change_suspend_state(hdev,
BT_SUSPEND_CONFIGURE_WAKE);
+ if (!ret)
+ state = BT_SUSPEND_CONFIGURE_WAKE;
+ }
+
+ hci_clear_wake_reason(hdev);
+ mgmt_suspending(hdev, state);
+
} else if (action == PM_POST_SUSPEND) {
ret = hci_change_suspend_state(hdev, BT_RUNNING);
+
+ mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr,
+ hdev->wake_addr_type);
}
done:
@@ -3556,6 +3631,10 @@ struct hci_dev *hci_alloc_dev(void)
hdev->cur_adv_instance = 0x00;
hdev->adv_instance_timeout = 0;
+ hdev->advmon_allowlist_duration = 300;
+ hdev->advmon_no_filter_duration = 500;
+ hdev->enable_advmon_interleave_scan = 0x00; /* Default to disable */
+
hdev->sniff_max_interval = 800;
hdev->sniff_min_interval = 80;
@@ -3587,6 +3666,8 @@ struct hci_dev *hci_alloc_dev(void)
hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES;
hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION;
hdev->def_le_autoconnect_timeout = HCI_LE_AUTOCONN_TIMEOUT;
+ hdev->min_le_tx_power = HCI_TX_POWER_INVALID;
+ hdev->max_le_tx_power = HCI_TX_POWER_INVALID;
hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT;
hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT;
@@ -3784,6 +3865,7 @@ void hci_unregister_dev(struct hci_dev *hdev)
cancel_work_sync(&hdev->power_on);
+ hci_suspend_clear_tasks(hdev);
unregister_pm_notifier(&hdev->suspend_notifier);
cancel_work_sync(&hdev->suspend_prepare);
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 5e8af2658e44..4626e0289a97 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -494,6 +494,45 @@ static int auto_accept_delay_get(void *data, u64 *val)
DEFINE_SIMPLE_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get,
auto_accept_delay_set, "%llu\n");
+static ssize_t force_bredr_smp_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t force_bredr_smp_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ bool enable;
+ int err;
+
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
+
+ err = smp_force_bredr(hdev, enable);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static const struct file_operations force_bredr_smp_fops = {
+ .open = simple_open,
+ .read = force_bredr_smp_read,
+ .write = force_bredr_smp_write,
+ .llseek = default_llseek,
+};
+
static int idle_timeout_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
@@ -589,6 +628,17 @@ void hci_debugfs_create_bredr(struct hci_dev *hdev)
debugfs_create_file("voice_setting", 0444, hdev->debugfs, hdev,
&voice_setting_fops);
+ /* If the controller does not support BR/EDR Secure Connections
+ * feature, then the BR/EDR SMP channel shall not be present.
+ *
+ * To test this with Bluetooth 4.0 controllers, create a debugfs
+ * switch that allows forcing BR/EDR SMP support and accepting
+ * cross-transport pairing on non-AES encrypted connections.
+ */
+ if (!lmp_sc_capable(hdev))
+ debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
+ hdev, &force_bredr_smp_fops);
+
if (lmp_ssp_capable(hdev)) {
debugfs_create_file("ssp_debug_mode", 0444, hdev->debugfs,
hdev, &ssp_debug_mode_fops);
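Once the switch is registered it behaves like any other debugfs bool; a hedged user-space example, assuming debugfs is mounted at /sys/kernel/debug and the controller is hci0 (the path is an assumption, not something this patch defines):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumed location; adjust the controller index as needed. */
	const char *path = "/sys/kernel/debug/bluetooth/hci0/force_bredr_smp";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* kstrtobool_from_user() on the kernel side accepts "Y"/"N", "1"/"0". */
	if (write(fd, "Y", 1) != 1)
		perror("write");

	close(fd);
	return 0;
}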
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 4b7fc430793c..67668be3461e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1202,6 +1202,20 @@ static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev,
hci_dev_unlock(hdev);
}
+static void hci_cc_le_read_transmit_power(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_transmit_power *rp = (void *)skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->min_le_tx_power = rp->min_le_tx_power;
+ hdev->max_le_tx_power = rp->max_le_tx_power;
+}
+
static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
{
__u8 *sent, status = *((__u8 *) skb->data);
@@ -1752,6 +1766,7 @@ static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
}
/* Update adv data as tx power is known now */
hci_req_update_adv_data(hdev, hdev->cur_adv_instance);
+
hci_dev_unlock(hdev);
}
@@ -2569,7 +2584,6 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_ev_conn_complete *ev = (void *) skb->data;
- struct inquiry_entry *ie;
struct hci_conn *conn;
BT_DBG("%s", hdev->name);
@@ -2578,13 +2592,19 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
if (!conn) {
- /* Connection may not exist if auto-connected. Check the inquiry
- * cache to see if we've already discovered this bdaddr before.
- * If found and link is an ACL type, create a connection class
+ /* Connection may not exist if auto-connected. Check the bredr
+ * allowlist to see if this device is allowed to auto connect.
+ * If link is an ACL type, create a connection class
* automatically.
+ *
+ * Auto-connect will only occur if the event filter is
+ * programmed with a given address. Right now, event filter is
+ * only used during suspend.
*/
- ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
- if (ie && ev->link_type == ACL_LINK) {
+ if (ev->link_type == ACL_LINK &&
+ hci_bdaddr_list_lookup_with_flags(&hdev->whitelist,
+ &ev->bdaddr,
+ BDADDR_BREDR)) {
conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr,
HCI_ROLE_SLAVE);
if (!conn) {
@@ -3576,6 +3596,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_le_set_adv_set_random_addr(hdev, skb);
break;
+ case HCI_OP_LE_READ_TRANSMIT_POWER:
+ hci_cc_le_read_transmit_power(hdev, skb);
+ break;
+
default:
BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
break;
@@ -4931,15 +4955,15 @@ static void hci_phy_link_complete_evt(struct hci_dev *hdev,
hci_dev_lock(hdev);
hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
- if (!hcon) {
- hci_dev_unlock(hdev);
- return;
- }
+ if (!hcon)
+ goto unlock;
+
+ if (!hcon->amp_mgr)
+ goto unlock;
if (ev->status) {
hci_conn_del(hcon);
- hci_dev_unlock(hdev);
- return;
+ goto unlock;
}
bredr_hcon = hcon->amp_mgr->l2cap_conn->hcon;
@@ -4956,6 +4980,7 @@ static void hci_phy_link_complete_evt(struct hci_dev *hdev,
amp_physical_cfm(bredr_hcon, hcon);
+unlock:
hci_dev_unlock(hdev);
}
@@ -5863,21 +5888,19 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev,
struct sk_buff *skb)
{
u8 num_reports = skb->data[0];
- void *ptr = &skb->data[1];
+ struct hci_ev_le_direct_adv_info *ev = (void *)&skb->data[1];
- hci_dev_lock(hdev);
+ if (!num_reports || skb->len < num_reports * sizeof(*ev) + 1)
+ return;
- while (num_reports--) {
- struct hci_ev_le_direct_adv_info *ev = ptr;
+ hci_dev_lock(hdev);
+ for (; num_reports; num_reports--, ev++)
process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
ev->bdaddr_type, &ev->direct_addr,
ev->direct_addr_type, ev->rssi, NULL, 0,
false);
- ptr += sizeof(*ev);
- }
-
hci_dev_unlock(hdev);
}
@@ -6012,6 +6035,75 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode,
return true;
}
+static void hci_store_wake_reason(struct hci_dev *hdev, u8 event,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_advertising_info *adv;
+ struct hci_ev_le_direct_adv_info *direct_adv;
+ struct hci_ev_le_ext_adv_report *ext_adv;
+ const struct hci_ev_conn_complete *conn_complete = (void *)skb->data;
+ const struct hci_ev_conn_request *conn_request = (void *)skb->data;
+
+ hci_dev_lock(hdev);
+
+ /* If we are currently suspended and this is the first BT event seen,
+ * save the wake reason associated with the event.
+ */
+ if (!hdev->suspended || hdev->wake_reason)
+ goto unlock;
+
+ /* Default to remote wake. Values for wake_reason are documented in the
+ * Bluez mgmt api docs.
+ */
+ hdev->wake_reason = MGMT_WAKE_REASON_REMOTE_WAKE;
+
+ /* Once configured for remote wakeup, we should only wake up for
+ * reconnections. It's useful to see which device is waking us up so
+ * keep track of the bdaddr of the connection event that woke us up.
+ */
+ if (event == HCI_EV_CONN_REQUEST) {
+		bacpy(&hdev->wake_addr, &conn_request->bdaddr);
+		hdev->wake_addr_type = BDADDR_BREDR;
+	} else if (event == HCI_EV_CONN_COMPLETE) {
+		bacpy(&hdev->wake_addr, &conn_complete->bdaddr);
+ hdev->wake_addr_type = BDADDR_BREDR;
+ } else if (event == HCI_EV_LE_META) {
+ struct hci_ev_le_meta *le_ev = (void *)skb->data;
+ u8 subevent = le_ev->subevent;
+ u8 *ptr = &skb->data[sizeof(*le_ev)];
+ u8 num_reports = *ptr;
+
+ if ((subevent == HCI_EV_LE_ADVERTISING_REPORT ||
+ subevent == HCI_EV_LE_DIRECT_ADV_REPORT ||
+ subevent == HCI_EV_LE_EXT_ADV_REPORT) &&
+ num_reports) {
+ adv = (void *)(ptr + 1);
+ direct_adv = (void *)(ptr + 1);
+ ext_adv = (void *)(ptr + 1);
+
+ switch (subevent) {
+ case HCI_EV_LE_ADVERTISING_REPORT:
+ bacpy(&hdev->wake_addr, &adv->bdaddr);
+ hdev->wake_addr_type = adv->bdaddr_type;
+ break;
+ case HCI_EV_LE_DIRECT_ADV_REPORT:
+ bacpy(&hdev->wake_addr, &direct_adv->bdaddr);
+ hdev->wake_addr_type = direct_adv->bdaddr_type;
+ break;
+ case HCI_EV_LE_EXT_ADV_REPORT:
+ bacpy(&hdev->wake_addr, &ext_adv->bdaddr);
+ hdev->wake_addr_type = ext_adv->bdaddr_type;
+ break;
+ }
+ }
+ } else {
+ hdev->wake_reason = MGMT_WAKE_REASON_UNEXPECTED;
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_event_hdr *hdr = (void *) skb->data;
@@ -6045,6 +6137,9 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
skb_pull(skb, HCI_EVENT_HDR_SIZE);
+ /* Store wake reason if we're suspended */
+ hci_store_wake_reason(hdev, event, skb);
+
switch (event) {
case HCI_EV_INQUIRY_COMPLETE:
hci_inquiry_complete_evt(hdev, skb);
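The hci_le_direct_adv_report_evt() rework above also validates skb->len against num_reports before iterating, instead of trusting the count byte. A stand-alone sketch of that length check over fixed-size records (types and names are illustrative):

#include <stddef.h>
#include <stdint.h>

struct report {
	uint8_t addr[6];
	int8_t rssi;
};

/* Reject the buffer unless it really contains the advertised number of
 * fixed-size records after the one-byte count, then walk them safely.
 */
static int walk_reports(const uint8_t *data, size_t len,
			void (*cb)(const struct report *))
{
	uint8_t num = len ? data[0] : 0;
	const struct report *rep = (const struct report *)(data + 1);

	if (!num || len < (size_t)num * sizeof(*rep) + 1)
		return -1;

	for (; num; num--, rep++)
		cb(rep);

	return 0;
}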
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index e0269192f2e5..71bffd745472 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -58,7 +58,7 @@ static int req_run(struct hci_request *req, hci_req_complete_t complete,
struct sk_buff *skb;
unsigned long flags;
- BT_DBG("length %u", skb_queue_len(&req->cmd_q));
+ bt_dev_dbg(hdev, "length %u", skb_queue_len(&req->cmd_q));
/* If an error occurred during request building, remove all HCI
* commands queued on the HCI request queue.
@@ -102,7 +102,7 @@ int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete)
static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
struct sk_buff *skb)
{
- BT_DBG("%s result 0x%2.2x", hdev->name, result);
+ bt_dev_dbg(hdev, "result 0x%2.2x", result);
if (hdev->req_status == HCI_REQ_PEND) {
hdev->req_result = result;
@@ -115,7 +115,7 @@ static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
void hci_req_sync_cancel(struct hci_dev *hdev, int err)
{
- BT_DBG("%s err 0x%2.2x", hdev->name, err);
+ bt_dev_dbg(hdev, "err 0x%2.2x", err);
if (hdev->req_status == HCI_REQ_PEND) {
hdev->req_result = err;
@@ -131,7 +131,7 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
struct sk_buff *skb;
int err = 0;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_req_init(&req, hdev);
@@ -167,7 +167,7 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
skb = hdev->req_skb;
hdev->req_skb = NULL;
- BT_DBG("%s end: err %d", hdev->name, err);
+ bt_dev_dbg(hdev, "end: err %d", err);
if (err < 0) {
kfree_skb(skb);
@@ -196,7 +196,7 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
struct hci_request req;
int err = 0;
- BT_DBG("%s start", hdev->name);
+ bt_dev_dbg(hdev, "start");
hci_req_init(&req, hdev);
@@ -260,7 +260,7 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
hdev->req_skb = NULL;
hdev->req_status = hdev->req_result = 0;
- BT_DBG("%s end: err %d", hdev->name, err);
+ bt_dev_dbg(hdev, "end: err %d", err);
return err;
}
@@ -300,7 +300,7 @@ struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
if (plen)
skb_put_data(skb, param, plen);
- BT_DBG("skb len %d", skb->len);
+ bt_dev_dbg(hdev, "skb len %d", skb->len);
hci_skb_pkt_type(skb) = HCI_COMMAND_PKT;
hci_skb_opcode(skb) = opcode;
@@ -315,7 +315,7 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
struct hci_dev *hdev = req->hdev;
struct sk_buff *skb;
- BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen);
+ bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen);
/* If an error occurred during request building, there is no point in
* queueing the HCI command. We can simply return.
@@ -378,6 +378,53 @@ void __hci_req_write_fast_connectable(struct hci_request *req, bool enable)
hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type);
}
+static void start_interleave_scan(struct hci_dev *hdev)
+{
+ hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER;
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->interleave_scan, 0);
+}
+
+static bool is_interleave_scanning(struct hci_dev *hdev)
+{
+ return hdev->interleave_scan_state != INTERLEAVE_SCAN_NONE;
+}
+
+static void cancel_interleave_scan(struct hci_dev *hdev)
+{
+ bt_dev_dbg(hdev, "cancelling interleave scan");
+
+ cancel_delayed_work_sync(&hdev->interleave_scan);
+
+ hdev->interleave_scan_state = INTERLEAVE_SCAN_NONE;
+}
+
+/* Return true if this call started interleave scanning, false otherwise. */
+static bool __hci_update_interleaved_scan(struct hci_dev *hdev)
+{
+	/* If there is at least one ADV monitor and one pending LE connection
+ * or one device to be scanned for, we should alternate between
+ * allowlist scan and one without any filters to save power.
+ */
+ bool use_interleaving = hci_is_adv_monitoring(hdev) &&
+ !(list_empty(&hdev->pend_le_conns) &&
+ list_empty(&hdev->pend_le_reports));
+ bool is_interleaving = is_interleave_scanning(hdev);
+
+ if (use_interleaving && !is_interleaving) {
+ start_interleave_scan(hdev);
+ bt_dev_dbg(hdev, "starting interleave scan");
+ return true;
+ }
+
+ if (!use_interleaving && is_interleaving)
+ cancel_interleave_scan(hdev);
+
+ return false;
+}
+
/* This function controls the background scanning based on hdev->pend_le_conns
* list. If there are pending LE connection we start the background scanning,
* otherwise we stop it.
@@ -413,8 +460,8 @@ static void __hci_update_background_scan(struct hci_request *req)
*/
hci_discovery_filter_clear(hdev);
- BT_DBG("%s ADV monitoring is %s", hdev->name,
- hci_is_adv_monitoring(hdev) ? "on" : "off");
+ bt_dev_dbg(hdev, "ADV monitoring is %s",
+ hci_is_adv_monitoring(hdev) ? "on" : "off");
if (list_empty(&hdev->pend_le_conns) &&
list_empty(&hdev->pend_le_reports) &&
@@ -430,7 +477,7 @@ static void __hci_update_background_scan(struct hci_request *req)
hci_req_add_le_scan_disable(req, false);
- BT_DBG("%s stopping background scanning", hdev->name);
+ bt_dev_dbg(hdev, "stopping background scanning");
} else {
/* If there is at least one pending LE connection, we should
* keep the background scan running.
@@ -450,8 +497,7 @@ static void __hci_update_background_scan(struct hci_request *req)
hci_req_add_le_scan_disable(req, false);
hci_req_add_le_passive_scan(req);
-
- BT_DBG("%s starting background scanning", hdev->name);
+ bt_dev_dbg(hdev, "starting background scanning");
}
}
@@ -661,6 +707,9 @@ void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn)
return;
}
+ if (hdev->suspended)
+ set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks);
+
if (use_ext_scan(hdev)) {
struct hci_cp_le_set_ext_scan_enable cp;
@@ -698,7 +747,8 @@ static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr,
cp.bdaddr_type);
hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, sizeof(cp), &cp);
- if (use_ll_privacy(req->hdev)) {
+ if (use_ll_privacy(req->hdev) &&
+ hci_dev_test_flag(req->hdev, HCI_ENABLE_LL_PRIVACY)) {
struct smp_irk *irk;
irk = hci_find_irk_by_addr(req->hdev, bdaddr, bdaddr_type);
@@ -732,7 +782,8 @@ static int add_to_white_list(struct hci_request *req,
return -1;
/* White list can not be used with RPAs */
- if (!allow_rpa && !use_ll_privacy(hdev) &&
+ if (!allow_rpa &&
+ !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) &&
hci_find_irk_by_addr(hdev, &params->addr, params->addr_type)) {
return -1;
}
@@ -750,7 +801,8 @@ static int add_to_white_list(struct hci_request *req,
cp.bdaddr_type);
hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp);
- if (use_ll_privacy(hdev)) {
+ if (use_ll_privacy(hdev) &&
+ hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) {
struct smp_irk *irk;
irk = hci_find_irk_by_addr(hdev, &params->addr,
@@ -812,7 +864,8 @@ static u8 update_white_list(struct hci_request *req)
}
/* White list can not be used with RPAs */
- if (!allow_rpa && !use_ll_privacy(hdev) &&
+ if (!allow_rpa &&
+ !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) &&
hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) {
return 0x00;
}
@@ -844,12 +897,17 @@ static u8 update_white_list(struct hci_request *req)
return 0x00;
}
- /* Once the controller offloading of advertisement monitor is in place,
- * the if condition should include the support of MSFT extension
- * support. If suspend is ongoing, whitelist should be the default to
- * prevent waking by random advertisements.
+ /* Use the allowlist unless the following conditions are all true:
+ * - We are not currently suspending
+ * - One or more ADV monitors are registered
+ * - Interleaved scanning is not currently using the allowlist
+ *
+ * Once controller offloading of advertisement monitors is in place,
+ * the above condition should also take MSFT extension support
+ * into account.
+ */
- if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended)
+ if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended &&
+ hdev->interleave_scan_state != INTERLEAVE_SCAN_ALLOWLIST)
return 0x00;
/* Select filter policy to use white list */
@@ -1002,6 +1060,11 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
&own_addr_type))
return;
+ if (hdev->enable_advmon_interleave_scan &&
+ __hci_update_interleaved_scan(hdev))
+ return;
+
+ bt_dev_dbg(hdev, "interleave state %d", hdev->interleave_scan_state);
/* Adding or removing entries from the white list must
* happen before enabling scanning. The controller does
* not allow white list modification while scanning.
@@ -1027,6 +1090,9 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
} else if (hci_is_le_conn_scanning(hdev)) {
window = hdev->le_scan_window_connect;
interval = hdev->le_scan_int_connect;
+ } else if (hci_is_adv_monitoring(hdev)) {
+ window = hdev->le_scan_window_adv_monitor;
+ interval = hdev->le_scan_int_adv_monitor;
} else {
window = hdev->le_scan_window;
interval = hdev->le_scan_interval;
@@ -1037,22 +1103,23 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
own_addr_type, filter_policy, addr_resolv);
}
-static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
+static bool adv_instance_is_scannable(struct hci_dev *hdev, u8 instance)
{
struct adv_info *adv_instance;
/* Instance 0x00 always set local name */
if (instance == 0x00)
- return 1;
+ return true;
adv_instance = hci_find_adv_instance(hdev, instance);
if (!adv_instance)
- return 0;
+ return false;
- /* TODO: Take into account the "appearance" and "local-name" flags here.
- * These are currently being ignored as they are not supported.
- */
- return adv_instance->scan_rsp_len;
+ if (adv_instance->flags & MGMT_ADV_FLAG_APPEARANCE ||
+ adv_instance->flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ return true;
+
+ return adv_instance->scan_rsp_len ? true : false;
}
static void hci_req_clear_event_filter(struct hci_request *req)
@@ -1095,6 +1162,11 @@ static void hci_req_set_event_filter(struct hci_request *req)
scan = SCAN_PAGE;
}
+ if (scan)
+ set_bit(SUSPEND_SCAN_ENABLE, hdev->suspend_tasks);
+ else
+ set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks);
+
hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
}
@@ -1111,6 +1183,64 @@ static void hci_req_config_le_suspend_scan(struct hci_request *req)
set_bit(SUSPEND_SCAN_ENABLE, req->hdev->suspend_tasks);
}
+static void cancel_adv_timeout(struct hci_dev *hdev)
+{
+ if (hdev->adv_instance_timeout) {
+ hdev->adv_instance_timeout = 0;
+ cancel_delayed_work(&hdev->adv_instance_expire);
+ }
+}
+
+/* This function requires the caller holds hdev->lock */
+void __hci_req_pause_adv_instances(struct hci_request *req)
+{
+ bt_dev_dbg(req->hdev, "Pausing advertising instances");
+
+ /* Call to disable any advertisements active on the controller.
+ * This will succeed even if no advertisements are configured.
+ */
+ __hci_req_disable_advertising(req);
+
+ /* If we are using software rotation, pause the loop */
+ if (!ext_adv_capable(req->hdev))
+ cancel_adv_timeout(req->hdev);
+}
+
+/* This function requires the caller holds hdev->lock */
+static void __hci_req_resume_adv_instances(struct hci_request *req)
+{
+ struct adv_info *adv;
+
+ bt_dev_dbg(req->hdev, "Resuming advertising instances");
+
+ if (ext_adv_capable(req->hdev)) {
+ /* Re-enable each tracked advertising instance */
+ list_for_each_entry(adv, &req->hdev->adv_instances, list) {
+ __hci_req_enable_ext_advertising(req,
+ adv->instance);
+ }
+
+ } else {
+ /* Schedule the most recent instance to be restarted and begin
+ * the software rotation loop
+ */
+ __hci_req_schedule_adv_instance(req,
+ req->hdev->cur_adv_instance,
+ true);
+ }
+}
+
+/* This function requires the caller holds hdev->lock */
+int hci_req_resume_adv_instances(struct hci_dev *hdev)
+{
+ struct hci_request req;
+
+ hci_req_init(&req, hdev);
+ __hci_req_resume_adv_instances(&req);
+
+ return hci_req_run(&req, NULL);
+}
+
static void suspend_req_complete(struct hci_dev *hdev, u8 status, u16 opcode)
{
bt_dev_dbg(hdev, "Request complete opcode=0x%x, status=0x%x", opcode,
@@ -1153,7 +1283,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next)
hdev->discovery_paused = true;
hdev->discovery_old_state = old_state;
- /* Stop advertising */
+ /* Stop directed advertising */
old_state = hci_dev_test_flag(hdev, HCI_ADVERTISING);
if (old_state) {
set_bit(SUSPEND_PAUSE_ADVERTISING, hdev->suspend_tasks);
@@ -1162,6 +1292,10 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next)
&hdev->discov_off, 0);
}
+ /* Pause other advertisements */
+ if (hdev->adv_instance_cnt)
+ __hci_req_pause_adv_instances(&req);
+
hdev->advertising_paused = true;
hdev->advertising_old_state = old_state;
/* Disable page scan */
@@ -1169,8 +1303,10 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next)
hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &page_scan);
/* Disable LE passive scan if enabled */
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ cancel_interleave_scan(hdev);
hci_req_add_le_scan_disable(&req, false);
+ }
/* Mark task needing completion */
set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks);
@@ -1212,7 +1348,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next)
/* Reset passive/background scanning to normal */
hci_req_config_le_suspend_scan(&req);
- /* Unpause advertising */
+ /* Unpause directed advertising */
hdev->advertising_paused = false;
if (hdev->advertising_old_state) {
set_bit(SUSPEND_UNPAUSE_ADVERTISING,
@@ -1223,6 +1359,10 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next)
hdev->advertising_old_state = 0;
}
+ /* Resume other advertisements */
+ if (hdev->adv_instance_cnt)
+ __hci_req_resume_adv_instances(&req);
+
/* Unpause discovery */
hdev->discovery_paused = false;
if (hdev->discovery_old_state != DISCOVERY_STOPPED &&
@@ -1242,23 +1382,9 @@ done:
wake_up(&hdev->suspend_wait_q);
}
-static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
+static bool adv_cur_instance_is_scannable(struct hci_dev *hdev)
{
- u8 instance = hdev->cur_adv_instance;
- struct adv_info *adv_instance;
-
- /* Instance 0x00 always set local name */
- if (instance == 0x00)
- return 1;
-
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return 0;
-
- /* TODO: Take into account the "appearance" and "local-name" flags here.
- * These are currently being ignored as they are not supported.
- */
- return adv_instance->scan_rsp_len;
+ return adv_instance_is_scannable(hdev, hdev->cur_adv_instance);
}
void __hci_req_disable_advertising(struct hci_request *req)
@@ -1370,6 +1496,7 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable)
void __hci_req_enable_advertising(struct hci_request *req)
{
struct hci_dev *hdev = req->hdev;
+ struct adv_info *adv_instance;
struct hci_cp_le_set_adv_param cp;
u8 own_addr_type, enable = 0x01;
bool connectable;
@@ -1377,6 +1504,7 @@ void __hci_req_enable_advertising(struct hci_request *req)
u32 flags;
flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance);
+ adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
/* If the "connectable" instance flag was not set, then choose between
* ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
@@ -1408,13 +1536,18 @@ void __hci_req_enable_advertising(struct hci_request *req)
memset(&cp, 0, sizeof(cp));
- if (connectable) {
- cp.type = LE_ADV_IND;
-
+ if (adv_instance) {
+ adv_min_interval = adv_instance->min_interval;
+ adv_max_interval = adv_instance->max_interval;
+ } else {
adv_min_interval = hdev->le_adv_min_interval;
adv_max_interval = hdev->le_adv_max_interval;
+ }
+
+ if (connectable) {
+ cp.type = LE_ADV_IND;
} else {
- if (get_cur_adv_instance_scan_rsp_len(hdev))
+ if (adv_cur_instance_is_scannable(hdev))
cp.type = LE_ADV_SCAN_IND;
else
cp.type = LE_ADV_NONCONN_IND;
@@ -1423,9 +1556,6 @@ void __hci_req_enable_advertising(struct hci_request *req)
hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) {
adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN;
adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX;
- } else {
- adv_min_interval = hdev->le_adv_min_interval;
- adv_max_interval = hdev->le_adv_max_interval;
}
}
@@ -1750,7 +1880,7 @@ void hci_req_disable_address_resolution(struct hci_dev *hdev)
static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
{
- BT_DBG("%s status %u", hdev->name, status);
+ bt_dev_dbg(hdev, "status %u", status);
}
void hci_req_reenable_advertising(struct hci_dev *hdev)
@@ -1787,7 +1917,7 @@ static void adv_timeout_expire(struct work_struct *work)
struct hci_request req;
u8 instance;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
@@ -1810,6 +1940,62 @@ unlock:
hci_dev_unlock(hdev);
}
+static int hci_req_add_le_interleaved_scan(struct hci_request *req,
+ unsigned long opt)
+{
+ struct hci_dev *hdev = req->hdev;
+ int ret = 0;
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ hci_req_add_le_scan_disable(req, false);
+ hci_req_add_le_passive_scan(req);
+
+ switch (hdev->interleave_scan_state) {
+ case INTERLEAVE_SCAN_ALLOWLIST:
+ bt_dev_dbg(hdev, "next state: no filter");
+ hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER;
+ break;
+ case INTERLEAVE_SCAN_NO_FILTER:
+ bt_dev_dbg(hdev, "next state: allowlist");
+ hdev->interleave_scan_state = INTERLEAVE_SCAN_ALLOWLIST;
+ break;
+ case INTERLEAVE_SCAN_NONE:
+ BT_ERR("unexpected error");
+ ret = -1;
+ }
+
+ hci_dev_unlock(hdev);
+
+ return ret;
+}
+
+static void interleave_scan_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ interleave_scan.work);
+ u8 status;
+ unsigned long timeout;
+
+ if (hdev->interleave_scan_state == INTERLEAVE_SCAN_ALLOWLIST) {
+ timeout = msecs_to_jiffies(hdev->advmon_allowlist_duration);
+ } else if (hdev->interleave_scan_state == INTERLEAVE_SCAN_NO_FILTER) {
+ timeout = msecs_to_jiffies(hdev->advmon_no_filter_duration);
+ } else {
+ bt_dev_err(hdev, "unexpected error");
+ return;
+ }
+
+ hci_req_sync(hdev, hci_req_add_le_interleaved_scan, 0,
+ HCI_CMD_TIMEOUT, &status);
+
+ /* Don't continue interleaving if it was canceled */
+ if (is_interleave_scanning(hdev))
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->interleave_scan, timeout);
+}
+
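The interleave scan introduced above is a two-state duty cycle: scan with the allowlist filter for advmon_allowlist_duration ms, then with no filter for advmon_no_filter_duration ms, with interleave_scan_work() rescheduling itself until cancel_interleave_scan() clears the state. The stand-alone sketch below (not kernel code; the durations are made-up illustrative values) mirrors that rotation to make the state machine easier to follow:

#include <stdio.h>

enum interleave_scan_state {
	INTERLEAVE_SCAN_NONE,
	INTERLEAVE_SCAN_NO_FILTER,
	INTERLEAVE_SCAN_ALLOWLIST,
};

int main(void)
{
	/* Illustrative durations only; the real values come from hdev */
	unsigned int allowlist_duration_ms = 300;
	unsigned int no_filter_duration_ms = 500;
	enum interleave_scan_state state = INTERLEAVE_SCAN_NO_FILTER;
	unsigned int elapsed = 0;

	/* Emulate a few rounds of interleave_scan_work() rescheduling itself */
	for (int round = 0; round < 6; round++) {
		unsigned int duration = state == INTERLEAVE_SCAN_ALLOWLIST ?
					allowlist_duration_ms :
					no_filter_duration_ms;

		printf("t=%5ums: scan with %s for %ums\n", elapsed,
		       state == INTERLEAVE_SCAN_ALLOWLIST ?
		       "allowlist filter" : "no filter", duration);

		elapsed += duration;
		/* Same transition as hci_req_add_le_interleaved_scan() */
		state = state == INTERLEAVE_SCAN_ALLOWLIST ?
			INTERLEAVE_SCAN_NO_FILTER : INTERLEAVE_SCAN_ALLOWLIST;
	}

	return 0;
}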
int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
bool use_rpa, struct adv_info *adv_instance,
u8 *own_addr_type, bdaddr_t *rand_addr)
@@ -1824,7 +2010,13 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
if (use_rpa) {
int to;
- *own_addr_type = ADDR_LE_DEV_RANDOM;
+ /* If the controller supports LL Privacy, use own address
+ * type 0x03
+ */
+ if (use_ll_privacy(hdev))
+ *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
+ else
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
if (adv_instance) {
if (!adv_instance->rpa_expired &&
@@ -1939,9 +2131,15 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
memset(&cp, 0, sizeof(cp));
- /* In ext adv set param interval is 3 octets */
- hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval);
- hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval);
+ if (adv_instance) {
+ hci_cpu_to_le24(adv_instance->min_interval, cp.min_interval);
+ hci_cpu_to_le24(adv_instance->max_interval, cp.max_interval);
+ cp.tx_power = adv_instance->tx_power;
+ } else {
+ hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval);
+ hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval);
+ cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE;
+ }
secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK);
@@ -1950,7 +2148,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND);
else
cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND);
- } else if (get_adv_instance_scan_rsp_len(hdev, instance)) {
+ } else if (adv_instance_is_scannable(hdev, instance)) {
if (secondary_adv)
cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND);
else
@@ -1964,7 +2162,6 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
cp.own_addr_type = own_addr_type;
cp.channel_map = hdev->le_adv_channel_map;
- cp.tx_power = 127;
cp.handle = instance;
if (flags & MGMT_ADV_FLAG_SEC_2M) {
@@ -2183,14 +2380,6 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
return 0;
}
-static void cancel_adv_timeout(struct hci_dev *hdev)
-{
- if (hdev->adv_instance_timeout) {
- hdev->adv_instance_timeout = 0;
- cancel_delayed_work(&hdev->adv_instance_expire);
- }
-}
-
/* For a single instance:
* - force == true: The instance will be removed even when its remaining
* lifetime is not zero.
@@ -2273,7 +2462,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
*/
if (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
hci_lookup_le_connect(hdev)) {
- BT_DBG("Deferring random address update");
+ bt_dev_dbg(hdev, "Deferring random address update");
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
return;
}
@@ -2498,7 +2687,7 @@ void __hci_req_update_class(struct hci_request *req)
struct hci_dev *hdev = req->hdev;
u8 cod[3];
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
if (!hdev_is_powered(hdev))
return;
@@ -2667,7 +2856,7 @@ void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
static void abort_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
{
if (status)
- BT_DBG("Failed to abort connection: status 0x%2.2x", status);
+ bt_dev_dbg(hdev, "Failed to abort connection: status 0x%2.2x", status);
}
int hci_abort_conn(struct hci_conn *conn, u8 reason)
@@ -2730,7 +2919,7 @@ static int bredr_inquiry(struct hci_request *req, unsigned long opt)
const u8 liac[3] = { 0x00, 0x8b, 0x9e };
struct hci_cp_inquiry cp;
- BT_DBG("%s", req->hdev->name);
+ bt_dev_dbg(req->hdev, "");
hci_dev_lock(req->hdev);
hci_inquiry_cache_flush(req->hdev);
@@ -2756,7 +2945,7 @@ static void le_scan_disable_work(struct work_struct *work)
le_scan_disable.work);
u8 status;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
return;
@@ -2852,7 +3041,7 @@ static void le_scan_restart_work(struct work_struct *work)
unsigned long timeout, duration, scan_start, now;
u8 status;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_req_sync(hdev, le_scan_restart, 0, HCI_CMD_TIMEOUT, &status);
if (status) {
@@ -2906,14 +3095,16 @@ static int active_scan(struct hci_request *req, unsigned long opt)
bool addr_resolv = false;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
/* If controller is scanning, it means the background scanning is
* running. Thus, we should temporarily stop it in order to set the
* discovery scanning parameters.
*/
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
hci_req_add_le_scan_disable(req, false);
+ cancel_interleave_scan(hdev);
+ }
/* All active scans will be done with either a resolvable private
* address (when privacy feature has been enabled) or non-resolvable
@@ -2934,7 +3125,7 @@ static int interleaved_discov(struct hci_request *req, unsigned long opt)
{
int err;
- BT_DBG("%s", req->hdev->name);
+ bt_dev_dbg(req->hdev, "");
err = active_scan(req, opt);
if (err)
@@ -2947,7 +3138,7 @@ static void start_discovery(struct hci_dev *hdev, u8 *status)
{
unsigned long timeout;
- BT_DBG("%s type %u", hdev->name, hdev->discovery.type);
+ bt_dev_dbg(hdev, "type %u", hdev->discovery.type);
switch (hdev->discovery.type) {
case DISCOV_TYPE_BREDR:
@@ -2995,7 +3186,7 @@ static void start_discovery(struct hci_dev *hdev, u8 *status)
if (*status)
return;
- BT_DBG("%s timeout %u ms", hdev->name, jiffies_to_msecs(timeout));
+ bt_dev_dbg(hdev, "timeout %u ms", jiffies_to_msecs(timeout));
/* When service discovery is used and the controller has a
* strict duplicate filter, it is important to remember the
@@ -3020,7 +3211,7 @@ bool hci_req_stop_discovery(struct hci_request *req)
struct inquiry_entry *e;
bool ret = false;
- BT_DBG("%s state %u", hdev->name, hdev->discovery.state);
+ bt_dev_dbg(hdev, "state %u", hdev->discovery.state);
if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) {
if (test_bit(HCI_INQUIRY, &hdev->flags))
@@ -3100,7 +3291,7 @@ static void discov_off(struct work_struct *work)
struct hci_dev *hdev = container_of(work, struct hci_dev,
discov_off.work);
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
@@ -3239,6 +3430,7 @@ void hci_request_setup(struct hci_dev *hdev)
INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work);
INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work);
INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
+ INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work);
}
void hci_request_cancel_all(struct hci_dev *hdev)
@@ -3258,4 +3450,6 @@ void hci_request_cancel_all(struct hci_dev *hdev)
cancel_delayed_work_sync(&hdev->adv_instance_expire);
hdev->adv_instance_timeout = 0;
}
+
+ cancel_interleave_scan(hdev);
}
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 6a12e84c66c4..39ee8a18087a 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -71,6 +71,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req);
void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next);
void hci_req_disable_address_resolution(struct hci_dev *hdev);
+void __hci_req_pause_adv_instances(struct hci_request *req);
+int hci_req_resume_adv_instances(struct hci_dev *hdev);
void hci_req_reenable_advertising(struct hci_dev *hdev);
void __hci_req_enable_advertising(struct hci_request *req);
void __hci_req_disable_advertising(struct hci_request *req);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 3b4fa27a44e6..0db48c812662 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1290,7 +1290,7 @@ static int hidp_session_thread(void *arg)
/* cleanup runtime environment */
remove_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait);
- remove_wait_queue(sk_sleep(session->intr_sock->sk), &ctrl_wait);
+ remove_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait);
wake_up_interruptible(&session->report_queue);
hidp_del_timer(session);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index ade83e224567..17b87b57a175 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1515,8 +1515,14 @@ static bool l2cap_check_enc_key_size(struct hci_conn *hcon)
* that have no key size requirements. Ensure that the link is
* actually encrypted before enforcing a key size.
*/
+ int min_key_size = hcon->hdev->min_enc_key_size;
+
+ /* On FIPS security level, key size must be 16 bytes */
+ if (hcon->sec_level == BT_SECURITY_FIPS)
+ min_key_size = 16;
+
return (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags) ||
- hcon->enc_key_size >= hcon->hdev->min_enc_key_size);
+ hcon->enc_key_size >= min_key_size);
}
static void l2cap_do_start(struct l2cap_chan *chan)
@@ -3627,7 +3633,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data
if (hint)
break;
result = L2CAP_CONF_UNKNOWN;
- *((u8 *) ptr++) = type;
+ l2cap_add_conf_opt(&ptr, (u8)type, sizeof(u8), type, endptr - ptr);
break;
}
}
@@ -7301,9 +7307,10 @@ static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
goto drop;
}
- if ((chan->mode == L2CAP_MODE_ERTM ||
- chan->mode == L2CAP_MODE_STREAMING) && sk_filter(chan->data, skb))
- goto drop;
+ if (chan->ops->filter) {
+ if (chan->ops->filter(chan, skb))
+ goto drop;
+ }
if (!control->sframe) {
int err;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index e1a3e66b1754..f1b1edd0b697 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1521,8 +1521,6 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err)
parent = bt_sk(sk)->parent;
- sock_set_flag(sk, SOCK_ZAPPED);
-
switch (chan->state) {
case BT_OPEN:
case BT_BOUND:
@@ -1549,8 +1547,11 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err)
break;
}
-
release_sock(sk);
+
+ /* Only zap after cleanup to avoid use after free race */
+ sock_set_flag(sk, SOCK_ZAPPED);
+
}
static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state,
@@ -1663,6 +1664,19 @@ static void l2cap_sock_suspend_cb(struct l2cap_chan *chan)
sk->sk_state_change(sk);
}
+static int l2cap_sock_filter(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct sock *sk = chan->data;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ return sk_filter(sk, skb);
+ }
+
+ return 0;
+}
+
static const struct l2cap_ops l2cap_chan_ops = {
.name = "L2CAP Socket Interface",
.new_connection = l2cap_sock_new_connection_cb,
@@ -1678,6 +1692,7 @@ static const struct l2cap_ops l2cap_chan_ops = {
.get_sndtimeo = l2cap_sock_get_sndtimeo_cb,
.get_peer_pid = l2cap_sock_get_peer_pid_cb,
.alloc_skb = l2cap_sock_alloc_skb_cb,
+ .filter = l2cap_sock_filter,
};
static void l2cap_sock_destruct(struct sock *sk)
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 5bbe71002fb9..fa0f7a4a1d2f 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -40,7 +40,7 @@
#include "msft.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 18
+#define MGMT_REVISION 19
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -110,7 +110,7 @@ static const u16 mgmt_commands[] = {
MGMT_OP_SET_APPEARANCE,
MGMT_OP_SET_BLOCKED_KEYS,
MGMT_OP_SET_WIDEBAND_SPEECH,
- MGMT_OP_READ_SECURITY_INFO,
+ MGMT_OP_READ_CONTROLLER_CAP,
MGMT_OP_READ_EXP_FEATURES_INFO,
MGMT_OP_SET_EXP_FEATURE,
MGMT_OP_READ_DEF_SYSTEM_CONFIG,
@@ -122,6 +122,8 @@ static const u16 mgmt_commands[] = {
MGMT_OP_READ_ADV_MONITOR_FEATURES,
MGMT_OP_ADD_ADV_PATTERNS_MONITOR,
MGMT_OP_REMOVE_ADV_MONITOR,
+ MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_OP_ADD_EXT_ADV_DATA,
};
static const u16 mgmt_events[] = {
@@ -163,6 +165,8 @@ static const u16 mgmt_events[] = {
MGMT_EV_PHY_CONFIGURATION_CHANGED,
MGMT_EV_EXP_FEATURE_CHANGED,
MGMT_EV_DEVICE_FLAGS_CHANGED,
+ MGMT_EV_CONTROLLER_SUSPEND,
+ MGMT_EV_CONTROLLER_RESUME,
};
static const u16 mgmt_untrusted_commands[] = {
@@ -172,7 +176,7 @@ static const u16 mgmt_untrusted_commands[] = {
MGMT_OP_READ_CONFIG_INFO,
MGMT_OP_READ_EXT_INDEX_LIST,
MGMT_OP_READ_EXT_INFO,
- MGMT_OP_READ_SECURITY_INFO,
+ MGMT_OP_READ_CONTROLLER_CAP,
MGMT_OP_READ_EXP_FEATURES_INFO,
MGMT_OP_READ_DEF_SYSTEM_CONFIG,
MGMT_OP_READ_DEF_RUNTIME_CONFIG,
@@ -782,7 +786,8 @@ static u32 get_supported_settings(struct hci_dev *hdev)
if (lmp_ssp_capable(hdev)) {
settings |= MGMT_SETTING_SSP;
- settings |= MGMT_SETTING_HS;
+ if (IS_ENABLED(CONFIG_BT_HS))
+ settings |= MGMT_SETTING_HS;
}
if (lmp_sc_capable(hdev))
@@ -1815,6 +1820,10 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
bt_dev_dbg(hdev, "sock %p", sk);
+ if (!IS_ENABLED(CONFIG_BT_HS))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
status = mgmt_bredr_support(hdev);
if (status)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, status);
@@ -3380,7 +3389,7 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev,
void *data, u16 len)
{
- struct mgmt_rp_get_phy_confguration rp;
+ struct mgmt_rp_get_phy_configuration rp;
bt_dev_dbg(hdev, "sock %p", sk);
@@ -3444,7 +3453,7 @@ unlock:
static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev,
void *data, u16 len)
{
- struct mgmt_cp_set_phy_confguration *cp = data;
+ struct mgmt_cp_set_phy_configuration *cp = data;
struct hci_cp_le_set_default_phy cp_phy;
struct mgmt_pending_cmd *cmd;
struct hci_request req;
@@ -3701,13 +3710,14 @@ unlock:
return err;
}
-static int read_security_info(struct sock *sk, struct hci_dev *hdev,
- void *data, u16 data_len)
+static int read_controller_cap(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
{
- char buf[16];
- struct mgmt_rp_read_security_info *rp = (void *)buf;
- u16 sec_len = 0;
+ char buf[20];
+ struct mgmt_rp_read_controller_cap *rp = (void *)buf;
+ u16 cap_len = 0;
u8 flags = 0;
+ u8 tx_power_range[2];
bt_dev_dbg(hdev, "sock %p", sk);
@@ -3731,23 +3741,37 @@ static int read_security_info(struct sock *sk, struct hci_dev *hdev,
flags |= 0x08; /* Encryption key size enforcement (LE) */
- sec_len = eir_append_data(rp->sec, sec_len, 0x01, &flags, 1);
+ cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_SEC_FLAGS,
+ &flags, 1);
/* When the Read Simple Pairing Options command is supported, then
* also max encryption key size information is provided.
*/
if (hdev->commands[41] & 0x08)
- sec_len = eir_append_le16(rp->sec, sec_len, 0x02,
+ cap_len = eir_append_le16(rp->cap, cap_len,
+ MGMT_CAP_MAX_ENC_KEY_SIZE,
hdev->max_enc_key_size);
- sec_len = eir_append_le16(rp->sec, sec_len, 0x03, SMP_MAX_ENC_KEY_SIZE);
+ cap_len = eir_append_le16(rp->cap, cap_len,
+ MGMT_CAP_SMP_MAX_ENC_KEY_SIZE,
+ SMP_MAX_ENC_KEY_SIZE);
+
+ /* Append the min/max LE tx power parameters if we were able to fetch
+ * them from the controller
+ */
+ if (hdev->commands[38] & 0x80) {
+ memcpy(&tx_power_range[0], &hdev->min_le_tx_power, 1);
+ memcpy(&tx_power_range[1], &hdev->max_le_tx_power, 1);
+ cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_LE_TX_PWR,
+ tx_power_range, 2);
+ }
- rp->sec_len = cpu_to_le16(sec_len);
+ rp->cap_len = cpu_to_le16(cap_len);
hci_dev_unlock(hdev);
- return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_SECURITY_INFO, 0,
- rp, sizeof(*rp) + sec_len);
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_CONTROLLER_CAP, 0,
+ rp, sizeof(*rp) + cap_len);
}
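The reply above is packed with eir_append_data()/eir_append_le16(), so rp->cap is a sequence of EIR-style entries: one length byte (covering the type byte plus the value), one type byte (MGMT_CAP_SEC_FLAGS, MGMT_CAP_MAX_ENC_KEY_SIZE, MGMT_CAP_SMP_MAX_ENC_KEY_SIZE, MGMT_CAP_LE_TX_PWR), then the value. A hedged userspace-side helper for walking that buffer might look like this:

#include <stdint.h>
#include <stdio.h>

/* Walk the EIR-style TLVs returned by MGMT_OP_READ_CONTROLLER_CAP.
 * cap points at rp->cap and cap_len is le16_to_cpu(rp->cap_len).
 */
static void dump_controller_cap(const uint8_t *cap, uint16_t cap_len)
{
	uint16_t off = 0;

	while (off + 2 <= cap_len) {
		uint8_t len = cap[off];		/* covers type byte + value */
		uint8_t type = cap[off + 1];
		const uint8_t *value = &cap[off + 2];

		if (len < 1 || off + 1 + len > cap_len)
			break;			/* malformed entry, stop */

		printf("capability type 0x%02x, %d value byte(s)\n",
		       type, len - 1);
		(void)value;			/* decode per-type as needed */

		off += 1 + len;			/* length byte + entry */
	}
}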
#ifdef CONFIG_BT_FEATURE_DEBUG
@@ -4157,7 +4181,7 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev,
{
struct adv_monitor *monitor = NULL;
struct mgmt_rp_read_adv_monitor_features *rp = NULL;
- int handle;
+ int handle, err;
size_t rp_size = 0;
__u32 supported = 0;
__u16 num_handles = 0;
@@ -4192,9 +4216,13 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev,
if (num_handles)
memcpy(&rp->handles, &handles, (num_handles * sizeof(u16)));
- return mgmt_cmd_complete(sk, hdev->id,
- MGMT_OP_READ_ADV_MONITOR_FEATURES,
- MGMT_STATUS_SUCCESS, rp, rp_size);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_READ_ADV_MONITOR_FEATURES,
+ MGMT_STATUS_SUCCESS, rp, rp_size);
+
+ kfree(rp);
+
+ return err;
}
static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
@@ -7192,6 +7220,10 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
flags |= MGMT_ADV_FLAG_MANAGED_FLAGS;
flags |= MGMT_ADV_FLAG_APPEARANCE;
flags |= MGMT_ADV_FLAG_LOCAL_NAME;
+ flags |= MGMT_ADV_PARAM_DURATION;
+ flags |= MGMT_ADV_PARAM_TIMEOUT;
+ flags |= MGMT_ADV_PARAM_INTERVALS;
+ flags |= MGMT_ADV_PARAM_TX_POWER;
/* In extended adv TX_POWER returned from Set Adv Param
* will be always valid.
@@ -7202,6 +7234,8 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
if (ext_adv_capable(hdev)) {
flags |= MGMT_ADV_FLAG_SEC_1M;
+ flags |= MGMT_ADV_FLAG_HW_OFFLOAD;
+ flags |= MGMT_ADV_FLAG_CAN_SET_TX_POWER;
if (hdev->le_features[1] & HCI_LE_PHY_2M)
flags |= MGMT_ADV_FLAG_SEC_2M;
@@ -7250,7 +7284,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
rp->supported_flags = cpu_to_le32(supported_flags);
rp->max_adv_data_len = HCI_MAX_AD_LENGTH;
rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH;
- rp->max_instances = HCI_MAX_ADV_INSTANCES;
+ rp->max_instances = hdev->le_num_of_adv_sets;
rp->num_instances = hdev->adv_instance_cnt;
instance = rp->instance;
@@ -7364,6 +7398,31 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
return true;
}
+static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags)
+{
+ u32 supported_flags, phy_flags;
+
+ /* The current implementation only supports a subset of the specified
+ * flags. Also need to check mutual exclusiveness of sec flags.
+ */
+ supported_flags = get_supported_adv_flags(hdev);
+ phy_flags = adv_flags & MGMT_ADV_FLAG_SEC_MASK;
+ if (adv_flags & ~supported_flags ||
+ ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
+ return false;
+
+ return true;
+}
+
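The phy_flags test above uses the lowest-set-bit identity: x & -x isolates the least significant set bit, so phy_flags ^ (phy_flags & -phy_flags) is non-zero exactly when more than one MGMT_ADV_FLAG_SEC_* bit is set. A tiny self-contained check of that property:

#include <assert.h>
#include <stdint.h>

/* Non-zero when at most one bit of x is set - the same test that
 * requested_adv_flags_are_valid() applies to the secondary-PHY flags.
 */
static int at_most_one_bit_set(uint32_t x)
{
	return (x ^ (x & -x)) == 0;
}

int main(void)
{
	assert(at_most_one_bit_set(0));			/* no SEC flag set */
	assert(at_most_one_bit_set(1u << 7));		/* exactly one flag */
	assert(!at_most_one_bit_set((1u << 7) | (1u << 8))); /* two flags */
	return 0;
}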
+static bool adv_busy(struct hci_dev *hdev)
+{
+ return (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) ||
+ pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) ||
+ pending_find(MGMT_OP_SET_LE, hdev) ||
+ pending_find(MGMT_OP_ADD_EXT_ADV_PARAMS, hdev) ||
+ pending_find(MGMT_OP_ADD_EXT_ADV_DATA, hdev));
+}
+
static void add_advertising_complete(struct hci_dev *hdev, u8 status,
u16 opcode)
{
@@ -7378,6 +7437,8 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status,
hci_dev_lock(hdev);
cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev);
+ if (!cmd)
+ cmd = pending_find(MGMT_OP_ADD_EXT_ADV_DATA, hdev);
list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
if (!adv_instance->pending)
@@ -7422,7 +7483,6 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
struct mgmt_cp_add_advertising *cp = data;
struct mgmt_rp_add_advertising rp;
u32 flags;
- u32 supported_flags, phy_flags;
u8 status;
u16 timeout, duration;
unsigned int prev_instance_cnt = hdev->adv_instance_cnt;
@@ -7446,7 +7506,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
MGMT_STATUS_NOT_SUPPORTED);
- if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES)
+ if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
@@ -7458,13 +7518,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
timeout = __le16_to_cpu(cp->timeout);
duration = __le16_to_cpu(cp->duration);
- /* The current implementation only supports a subset of the specified
- * flags. Also need to check mutual exclusiveness of sec flags.
- */
- supported_flags = get_supported_adv_flags(hdev);
- phy_flags = flags & MGMT_ADV_FLAG_SEC_MASK;
- if (flags & ~supported_flags ||
- ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
+ if (!requested_adv_flags_are_valid(hdev, flags))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
@@ -7476,9 +7530,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) ||
- pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) ||
- pending_find(MGMT_OP_SET_LE, hdev)) {
+ if (adv_busy(hdev)) {
err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_BUSY);
goto unlock;
@@ -7496,7 +7548,10 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
cp->adv_data_len, cp->data,
cp->scan_rsp_len,
cp->data + cp->adv_data_len,
- timeout, duration);
+ timeout, duration,
+ HCI_ADV_TX_POWER_NO_PREFERENCE,
+ hdev->le_adv_min_interval,
+ hdev->le_adv_max_interval);
if (err < 0) {
err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_FAILED);
@@ -7569,6 +7624,338 @@ unlock:
return err;
}
+static void add_ext_adv_params_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct mgmt_cp_add_ext_adv_params *cp;
+ struct mgmt_rp_add_ext_adv_params rp;
+ struct adv_info *adv_instance;
+ u32 flags;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_ADD_EXT_ADV_PARAMS, hdev);
+ if (!cmd)
+ goto unlock;
+
+ cp = cmd->param;
+ adv_instance = hci_find_adv_instance(hdev, cp->instance);
+ if (!adv_instance)
+ goto unlock;
+
+ rp.instance = cp->instance;
+ rp.tx_power = adv_instance->tx_power;
+
+ /* While we're at it, inform userspace of the available space for this
+ * advertisement, given the flags that will be used.
+ */
+ flags = __le32_to_cpu(cp->flags);
+ rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
+ rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
+
+ if (status) {
+ /* If this advertisement was previously advertising and we
+ * failed to update it, we signal that it has been removed and
+ * delete its structure
+ */
+ if (!adv_instance->pending)
+ mgmt_advertising_removed(cmd->sk, hdev, cp->instance);
+
+ hci_remove_adv_instance(hdev, cp->instance);
+
+ mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
+ mgmt_status(status));
+
+ } else {
+ mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+ mgmt_status(status), &rp, sizeof(rp));
+ }
+
+unlock:
+ if (cmd)
+ mgmt_pending_remove(cmd);
+
+ hci_dev_unlock(hdev);
+}
+
+static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_add_ext_adv_params *cp = data;
+ struct mgmt_rp_add_ext_adv_params rp;
+ struct mgmt_pending_cmd *cmd = NULL;
+ struct adv_info *adv_instance;
+ struct hci_request req;
+ u32 flags, min_interval, max_interval;
+ u16 timeout, duration;
+ u8 status;
+ s8 tx_power;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ status = mgmt_le_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ status);
+
+ if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* The purpose of breaking add_advertising into two separate MGMT calls
+ * for params and data is to allow more parameters to be added to this
+ * structure in the future. For this reason, we verify that we have the
+ * bare minimum structure we know of when the interface was defined. Any
+ * extra parameters we don't know about will be ignored in this request.
+ */
+ if (data_len < MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ flags = __le32_to_cpu(cp->flags);
+
+ if (!requested_adv_flags_are_valid(hdev, flags))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ /* In new interface, we require that we are powered to register */
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ if (adv_busy(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ /* Parse defined parameters from request, use defaults otherwise */
+ timeout = (flags & MGMT_ADV_PARAM_TIMEOUT) ?
+ __le16_to_cpu(cp->timeout) : 0;
+
+ duration = (flags & MGMT_ADV_PARAM_DURATION) ?
+ __le16_to_cpu(cp->duration) :
+ hdev->def_multi_adv_rotation_duration;
+
+ min_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ?
+ __le32_to_cpu(cp->min_interval) :
+ hdev->le_adv_min_interval;
+
+ max_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ?
+ __le32_to_cpu(cp->max_interval) :
+ hdev->le_adv_max_interval;
+
+ tx_power = (flags & MGMT_ADV_PARAM_TX_POWER) ?
+ cp->tx_power :
+ HCI_ADV_TX_POWER_NO_PREFERENCE;
+
+ /* Create advertising instance with no advertising or response data */
+ err = hci_add_adv_instance(hdev, cp->instance, flags,
+ 0, NULL, 0, NULL, timeout, duration,
+ tx_power, min_interval, max_interval);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_FAILED);
+ goto unlock;
+ }
+
+ hdev->cur_adv_instance = cp->instance;
+ /* Submit request for advertising params if ext adv available */
+ if (ext_adv_capable(hdev)) {
+ hci_req_init(&req, hdev);
+ adv_instance = hci_find_adv_instance(hdev, cp->instance);
+
+ /* Updating parameters of an active instance will return a
+ * Command Disallowed error, so we must first disable the
+ * instance if it is active.
+ */
+ if (!adv_instance->pending)
+ __hci_req_disable_ext_adv_instance(&req, cp->instance);
+
+ __hci_req_setup_ext_adv_instance(&req, cp->instance);
+
+ err = hci_req_run(&req, add_ext_adv_params_complete);
+
+ if (!err)
+ cmd = mgmt_pending_add(sk, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ hdev, data, data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ hci_remove_adv_instance(hdev, cp->instance);
+ goto unlock;
+ }
+
+ } else {
+ rp.instance = cp->instance;
+ rp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE;
+ rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
+ rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int add_ext_adv_data(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_cp_add_ext_adv_data *cp = data;
+ struct mgmt_rp_add_ext_adv_data rp;
+ u8 schedule_instance = 0;
+ struct adv_info *next_instance;
+ struct adv_info *adv_instance;
+ int err = 0;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ adv_instance = hci_find_adv_instance(hdev, cp->instance);
+
+ if (!adv_instance) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ /* In new interface, we require that we are powered to register */
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_REJECTED);
+ goto clear_new_instance;
+ }
+
+ if (adv_busy(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_BUSY);
+ goto clear_new_instance;
+ }
+
+ /* Validate new data */
+ if (!tlv_data_is_valid(hdev, adv_instance->flags, cp->data,
+ cp->adv_data_len, true) ||
+ !tlv_data_is_valid(hdev, adv_instance->flags, cp->data +
+ cp->adv_data_len, cp->scan_rsp_len, false)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto clear_new_instance;
+ }
+
+ /* Set the data in the advertising instance */
+ hci_set_adv_instance_data(hdev, cp->instance, cp->adv_data_len,
+ cp->data, cp->scan_rsp_len,
+ cp->data + cp->adv_data_len);
+
+ /* We're good to go, update advertising data, parameters, and start
+ * advertising.
+ */
+
+ hci_req_init(&req, hdev);
+
+ hci_req_add(&req, HCI_OP_READ_LOCAL_NAME, 0, NULL);
+
+ if (ext_adv_capable(hdev)) {
+ __hci_req_update_adv_data(&req, cp->instance);
+ __hci_req_update_scan_rsp_data(&req, cp->instance);
+ __hci_req_enable_ext_advertising(&req, cp->instance);
+
+ } else {
+ /* If using software rotation, determine next instance to use */
+
+ if (hdev->cur_adv_instance == cp->instance) {
+ /* If the currently advertised instance is being changed
+ * then cancel the current advertising and schedule the
+ * next instance. If there is only one instance then the
+ * overridden advertising data will be visible right
+ * away
+ */
+ cancel_adv_timeout(hdev);
+
+ next_instance = hci_get_next_instance(hdev,
+ cp->instance);
+ if (next_instance)
+ schedule_instance = next_instance->instance;
+ } else if (!hdev->adv_instance_timeout) {
+ /* Immediately advertise the new instance if no other
+ * instance is currently being advertised.
+ */
+ schedule_instance = cp->instance;
+ }
+
+ /* If the HCI_ADVERTISING flag is set or there is no instance to
+ * be advertised then we have no HCI communication to make.
+ * Simply return.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ !schedule_instance) {
+ if (adv_instance->pending) {
+ mgmt_advertising_added(sk, hdev, cp->instance);
+ adv_instance->pending = false;
+ }
+ rp.instance = cp->instance;
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_SUCCESS, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ err = __hci_req_schedule_adv_instance(&req, schedule_instance,
+ true);
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_ADD_EXT_ADV_DATA, hdev, data,
+ data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto clear_new_instance;
+ }
+
+ if (!err)
+ err = hci_req_run(&req, add_advertising_complete);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_FAILED);
+ mgmt_pending_remove(cmd);
+ goto clear_new_instance;
+ }
+
+ /* We were successful in updating data, so trigger advertising_added
+ * event if this is an instance that wasn't previously advertising. If
+ * a failure occurs in the requests we initiated, we will remove the
+ * instance again in add_advertising_complete
+ */
+ if (adv_instance->pending)
+ mgmt_advertising_added(sk, hdev, cp->instance);
+
+ goto unlock;
+
+clear_new_instance:
+ hci_remove_adv_instance(hdev, cp->instance);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
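For context, the new interface splits what add_advertising did into two commands: MGMT_OP_ADD_EXT_ADV_PARAMS registers the instance with its parameters (intervals, tx power, duration, timeout) and MGMT_OP_ADD_EXT_ADV_DATA then supplies the advertising/scan-response payload. A hedged userspace sketch of that two-step flow is shown below; the structure layouts are local copies of the fields the handlers above read (include/net/bluetooth/mgmt.h is authoritative for field order and the MGMT_OP_* values), and mgmt_send() stands in for whatever helper the application uses to write a mgmt_hdr-prefixed command to the HCI control channel.

#include <stdint.h>
#include <string.h>

/* Hedged local copies; see mgmt.h for the real definitions */
struct add_ext_adv_params_cp {
	uint8_t  instance;
	uint32_t flags;		/* MGMT_ADV_PARAM_* bits select which fields apply */
	uint16_t duration;
	uint16_t timeout;
	uint32_t min_interval;
	uint32_t max_interval;
	int8_t   tx_power;
} __attribute__((packed));

struct add_ext_adv_data_cp {
	uint8_t instance;
	uint8_t adv_data_len;
	uint8_t scan_rsp_len;
	uint8_t data[];
} __attribute__((packed));

/* Hypothetical transport helper: prepends struct mgmt_hdr and writes the
 * command to an HCI control-channel (mgmt) socket, returning 0 on success.
 */
int mgmt_send(uint16_t opcode, uint16_t index, const void *param, uint16_t len);

static int register_ext_adv(uint16_t hci_index, const uint8_t *ad, uint8_t ad_len)
{
	struct add_ext_adv_params_cp params = {
		.instance = 1,
		.flags = 0,	/* 0: fall back to the hdev defaults for everything */
	};
	uint8_t buf[sizeof(struct add_ext_adv_data_cp) + 255] = { 0 };
	struct add_ext_adv_data_cp *adv = (void *)buf;
	int err;

	/* Step 1: create the instance and let the kernel validate the params */
	err = mgmt_send(MGMT_OP_ADD_EXT_ADV_PARAMS, hci_index,
			&params, sizeof(params));
	if (err)
		return err;

	/* Step 2: attach the payload; only now is the instance scheduled */
	adv->instance = 1;
	adv->adv_data_len = ad_len;
	memcpy(adv->data, ad, ad_len);

	return mgmt_send(MGMT_OP_ADD_EXT_ADV_DATA, hci_index,
			 adv, sizeof(*adv) + ad_len);
}

The ordering matters: add_ext_adv_data() rejects an instance that was never created by add_ext_adv_params(), and an instance that was created but never given data is cleaned up on the error paths above.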
static void remove_advertising_complete(struct hci_dev *hdev, u8 status,
u16 opcode)
{
@@ -7699,7 +8086,7 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
MGMT_STATUS_REJECTED);
- if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES)
+ if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
MGMT_STATUS_INVALID_PARAMS);
@@ -7821,7 +8208,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE,
HCI_MGMT_VAR_LEN },
{ set_wideband_speech, MGMT_SETTING_SIZE },
- { read_security_info, MGMT_READ_SECURITY_INFO_SIZE,
+ { read_controller_cap, MGMT_READ_CONTROLLER_CAP_SIZE,
HCI_MGMT_UNTRUSTED },
{ read_exp_features_info, MGMT_READ_EXP_FEATURES_INFO_SIZE,
HCI_MGMT_UNTRUSTED |
@@ -7843,6 +8230,10 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ add_adv_patterns_monitor,MGMT_ADD_ADV_PATTERNS_MONITOR_SIZE,
HCI_MGMT_VAR_LEN },
{ remove_adv_monitor, MGMT_REMOVE_ADV_MONITOR_SIZE },
+ { add_ext_adv_params, MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { add_ext_adv_data, MGMT_ADD_EXT_ADV_DATA_SIZE,
+ HCI_MGMT_VAR_LEN },
};
void mgmt_index_added(struct hci_dev *hdev)
@@ -8262,6 +8653,10 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr,
ev.addr.type = link_to_bdaddr(link_type, addr_type);
ev.reason = reason;
+ /* Report disconnects due to suspend */
+ if (hdev->suspended)
+ ev.reason = MGMT_DEV_DISCONN_LOCAL_HOST_SUSPEND;
+
mgmt_event(MGMT_EV_DEVICE_DISCONNECTED, hdev, &ev, sizeof(ev), sk);
if (sk)
@@ -8868,6 +9263,30 @@ void mgmt_discovering(struct hci_dev *hdev, u8 discovering)
mgmt_event(MGMT_EV_DISCOVERING, hdev, &ev, sizeof(ev), NULL);
}
+void mgmt_suspending(struct hci_dev *hdev, u8 state)
+{
+ struct mgmt_ev_controller_suspend ev;
+
+ ev.suspend_state = state;
+ mgmt_event(MGMT_EV_CONTROLLER_SUSPEND, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_resuming(struct hci_dev *hdev, u8 reason, bdaddr_t *bdaddr,
+ u8 addr_type)
+{
+ struct mgmt_ev_controller_resume ev;
+
+ ev.wake_reason = reason;
+ if (bdaddr) {
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = addr_type;
+ } else {
+ memset(&ev.addr, 0, sizeof(ev.addr));
+ }
+
+ mgmt_event(MGMT_EV_CONTROLLER_RESUME, hdev, &ev, sizeof(ev), NULL);
+}
+
static struct hci_mgmt_chan chan = {
.channel = HCI_CHANNEL_CONTROL,
.handler_count = ARRAY_SIZE(mgmt_handlers),
diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c
index b30b571f8caf..1deb0ca7a929 100644
--- a/net/bluetooth/mgmt_config.c
+++ b/net/bluetooth/mgmt_config.c
@@ -11,74 +11,119 @@
#include "mgmt_util.h"
#include "mgmt_config.h"
-#define HDEV_PARAM_U16(_param_code_, _param_name_) \
-{ \
- { cpu_to_le16(_param_code_), sizeof(__u16) }, \
- { cpu_to_le16(hdev->_param_name_) } \
-}
+#define HDEV_PARAM_U16(_param_name_) \
+ struct {\
+ struct mgmt_tlv entry; \
+ __le16 value; \
+ } __packed _param_name_
-#define HDEV_PARAM_U16_JIFFIES_TO_MSECS(_param_code_, _param_name_) \
-{ \
- { cpu_to_le16(_param_code_), sizeof(__u16) }, \
- { cpu_to_le16(jiffies_to_msecs(hdev->_param_name_)) } \
-}
+#define HDEV_PARAM_U8(_param_name_) \
+ struct {\
+ struct mgmt_tlv entry; \
+ __u8 value; \
+ } __packed _param_name_
+
+#define TLV_SET_U16(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u16) }, \
+ cpu_to_le16(hdev->_param_name_) \
+ }
+
+#define TLV_SET_U8(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u8) }, \
+ hdev->_param_name_ \
+ }
+
+#define TLV_SET_U16_JIFFIES_TO_MSECS(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u16) }, \
+ cpu_to_le16(jiffies_to_msecs(hdev->_param_name_)) \
+ }
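For one entry, the pairing of the two macro families above works out roughly as follows (preprocessor output shown as a fragment, not a standalone snippet): HDEV_PARAM_U16(def_page_scan_type) declares the member and TLV_SET_U16(0x0000, def_page_scan_type) provides its positional initializer.

/* declaration produced by HDEV_PARAM_U16(def_page_scan_type): */
struct {
	struct mgmt_tlv entry;
	__le16 value;
} __packed def_page_scan_type;

/* initializer produced by TLV_SET_U16(0x0000, def_page_scan_type): */
{
	{ cpu_to_le16(0x0000), sizeof(__u16) },	/* mgmt_tlv type + length */
	cpu_to_le16(hdev->def_page_scan_type)	/* little-endian value */
}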
int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
u16 data_len)
{
- struct {
- struct mgmt_tlv entry;
- union {
- /* This is a simplification for now since all values
- * are 16 bits. In the future, this code may need
- * refactoring to account for variable length values
- * and properly calculate the required buffer size.
- */
- __le16 value;
- };
- } __packed params[] = {
+ int ret;
+ struct mgmt_rp_read_def_system_config {
/* Please see mgmt-api.txt for documentation of these values */
- HDEV_PARAM_U16(0x0000, def_page_scan_type),
- HDEV_PARAM_U16(0x0001, def_page_scan_int),
- HDEV_PARAM_U16(0x0002, def_page_scan_window),
- HDEV_PARAM_U16(0x0003, def_inq_scan_type),
- HDEV_PARAM_U16(0x0004, def_inq_scan_int),
- HDEV_PARAM_U16(0x0005, def_inq_scan_window),
- HDEV_PARAM_U16(0x0006, def_br_lsto),
- HDEV_PARAM_U16(0x0007, def_page_timeout),
- HDEV_PARAM_U16(0x0008, sniff_min_interval),
- HDEV_PARAM_U16(0x0009, sniff_max_interval),
- HDEV_PARAM_U16(0x000a, le_adv_min_interval),
- HDEV_PARAM_U16(0x000b, le_adv_max_interval),
- HDEV_PARAM_U16(0x000c, def_multi_adv_rotation_duration),
- HDEV_PARAM_U16(0x000d, le_scan_interval),
- HDEV_PARAM_U16(0x000e, le_scan_window),
- HDEV_PARAM_U16(0x000f, le_scan_int_suspend),
- HDEV_PARAM_U16(0x0010, le_scan_window_suspend),
- HDEV_PARAM_U16(0x0011, le_scan_int_discovery),
- HDEV_PARAM_U16(0x0012, le_scan_window_discovery),
- HDEV_PARAM_U16(0x0013, le_scan_int_adv_monitor),
- HDEV_PARAM_U16(0x0014, le_scan_window_adv_monitor),
- HDEV_PARAM_U16(0x0015, le_scan_int_connect),
- HDEV_PARAM_U16(0x0016, le_scan_window_connect),
- HDEV_PARAM_U16(0x0017, le_conn_min_interval),
- HDEV_PARAM_U16(0x0018, le_conn_max_interval),
- HDEV_PARAM_U16(0x0019, le_conn_latency),
- HDEV_PARAM_U16(0x001a, le_supv_timeout),
- HDEV_PARAM_U16_JIFFIES_TO_MSECS(0x001b,
- def_le_autoconnect_timeout),
+ HDEV_PARAM_U16(def_page_scan_type);
+ HDEV_PARAM_U16(def_page_scan_int);
+ HDEV_PARAM_U16(def_page_scan_window);
+ HDEV_PARAM_U16(def_inq_scan_type);
+ HDEV_PARAM_U16(def_inq_scan_int);
+ HDEV_PARAM_U16(def_inq_scan_window);
+ HDEV_PARAM_U16(def_br_lsto);
+ HDEV_PARAM_U16(def_page_timeout);
+ HDEV_PARAM_U16(sniff_min_interval);
+ HDEV_PARAM_U16(sniff_max_interval);
+ HDEV_PARAM_U16(le_adv_min_interval);
+ HDEV_PARAM_U16(le_adv_max_interval);
+ HDEV_PARAM_U16(def_multi_adv_rotation_duration);
+ HDEV_PARAM_U16(le_scan_interval);
+ HDEV_PARAM_U16(le_scan_window);
+ HDEV_PARAM_U16(le_scan_int_suspend);
+ HDEV_PARAM_U16(le_scan_window_suspend);
+ HDEV_PARAM_U16(le_scan_int_discovery);
+ HDEV_PARAM_U16(le_scan_window_discovery);
+ HDEV_PARAM_U16(le_scan_int_adv_monitor);
+ HDEV_PARAM_U16(le_scan_window_adv_monitor);
+ HDEV_PARAM_U16(le_scan_int_connect);
+ HDEV_PARAM_U16(le_scan_window_connect);
+ HDEV_PARAM_U16(le_conn_min_interval);
+ HDEV_PARAM_U16(le_conn_max_interval);
+ HDEV_PARAM_U16(le_conn_latency);
+ HDEV_PARAM_U16(le_supv_timeout);
+ HDEV_PARAM_U16(def_le_autoconnect_timeout);
+ HDEV_PARAM_U16(advmon_allowlist_duration);
+ HDEV_PARAM_U16(advmon_no_filter_duration);
+ HDEV_PARAM_U8(enable_advmon_interleave_scan);
+ } __packed rp = {
+ TLV_SET_U16(0x0000, def_page_scan_type),
+ TLV_SET_U16(0x0001, def_page_scan_int),
+ TLV_SET_U16(0x0002, def_page_scan_window),
+ TLV_SET_U16(0x0003, def_inq_scan_type),
+ TLV_SET_U16(0x0004, def_inq_scan_int),
+ TLV_SET_U16(0x0005, def_inq_scan_window),
+ TLV_SET_U16(0x0006, def_br_lsto),
+ TLV_SET_U16(0x0007, def_page_timeout),
+ TLV_SET_U16(0x0008, sniff_min_interval),
+ TLV_SET_U16(0x0009, sniff_max_interval),
+ TLV_SET_U16(0x000a, le_adv_min_interval),
+ TLV_SET_U16(0x000b, le_adv_max_interval),
+ TLV_SET_U16(0x000c, def_multi_adv_rotation_duration),
+ TLV_SET_U16(0x000d, le_scan_interval),
+ TLV_SET_U16(0x000e, le_scan_window),
+ TLV_SET_U16(0x000f, le_scan_int_suspend),
+ TLV_SET_U16(0x0010, le_scan_window_suspend),
+ TLV_SET_U16(0x0011, le_scan_int_discovery),
+ TLV_SET_U16(0x0012, le_scan_window_discovery),
+ TLV_SET_U16(0x0013, le_scan_int_adv_monitor),
+ TLV_SET_U16(0x0014, le_scan_window_adv_monitor),
+ TLV_SET_U16(0x0015, le_scan_int_connect),
+ TLV_SET_U16(0x0016, le_scan_window_connect),
+ TLV_SET_U16(0x0017, le_conn_min_interval),
+ TLV_SET_U16(0x0018, le_conn_max_interval),
+ TLV_SET_U16(0x0019, le_conn_latency),
+ TLV_SET_U16(0x001a, le_supv_timeout),
+ TLV_SET_U16_JIFFIES_TO_MSECS(0x001b,
+ def_le_autoconnect_timeout),
+ TLV_SET_U16(0x001d, advmon_allowlist_duration),
+ TLV_SET_U16(0x001e, advmon_no_filter_duration),
+ TLV_SET_U8(0x001f, enable_advmon_interleave_scan),
};
- struct mgmt_rp_read_def_system_config *rp = (void *)params;
bt_dev_dbg(hdev, "sock %p", sk);
- return mgmt_cmd_complete(sk, hdev->id,
- MGMT_OP_READ_DEF_SYSTEM_CONFIG,
- 0, rp, sizeof(params));
+ ret = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_READ_DEF_SYSTEM_CONFIG,
+ 0, &rp, sizeof(rp));
+ return ret;
}
#define TO_TLV(x) ((struct mgmt_tlv *)(x))
#define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value)))
+#define TLV_GET_U8(tlv) (*((__u8 *)(TO_TLV(tlv)->value)))
int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
u16 data_len)
@@ -95,6 +140,7 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
/* First pass to validate the tlv */
while (buffer_left >= sizeof(struct mgmt_tlv)) {
const u8 len = TO_TLV(buffer)->length;
+ size_t exp_type_len;
const u16 exp_len = sizeof(struct mgmt_tlv) +
len;
const u16 type = le16_to_cpu(TO_TLV(buffer)->type);
@@ -138,20 +184,28 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
case 0x0019:
case 0x001a:
case 0x001b:
- if (len != sizeof(u16)) {
- bt_dev_warn(hdev, "invalid length %d, exp %zu for type %d",
- len, sizeof(u16), type);
-
- return mgmt_cmd_status(sk, hdev->id,
- MGMT_OP_SET_DEF_SYSTEM_CONFIG,
- MGMT_STATUS_INVALID_PARAMS);
- }
+ case 0x001d:
+ case 0x001e:
+ exp_type_len = sizeof(u16);
+ break;
+ case 0x001f:
+ exp_type_len = sizeof(u8);
break;
default:
+ exp_type_len = 0;
bt_dev_warn(hdev, "unsupported parameter %u", type);
break;
}
+ if (exp_type_len && len != exp_type_len) {
+ bt_dev_warn(hdev, "invalid length %d, exp %zu for type %d",
+ len, exp_type_len, type);
+
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
buffer_left -= exp_len;
buffer += exp_len;
}
@@ -251,6 +305,15 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
hdev->def_le_autoconnect_timeout =
msecs_to_jiffies(TLV_GET_LE16(buffer));
break;
+ case 0x001d:
+ hdev->advmon_allowlist_duration = TLV_GET_LE16(buffer);
+ break;
+ case 0x001e:
+ hdev->advmon_no_filter_duration = TLV_GET_LE16(buffer);
+ break;
+ case 0x001f:
+ hdev->enable_advmon_interleave_scan = TLV_GET_U8(buffer);
+ break;
default:
bt_dev_warn(hdev, "unsupported parameter %u", type);
break;
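The three new parameters (0x001d allowlist duration, 0x001e no-filter duration, 0x001f interleave enable) are configured through MGMT_OP_SET_DEF_SYSTEM_CONFIG with exactly the TLV stream the two loops above validate and apply. A hedged userspace sketch of building that payload (the mgmt_tlv layout is mirrored from the kernel header; the durations are example values, not defaults):

#include <stdint.h>
#include <string.h>

struct mgmt_tlv {
	uint16_t type;		/* little-endian on the wire */
	uint8_t  length;
	uint8_t  value[];
} __attribute__((packed));

static size_t add_u16_tlv(uint8_t *buf, uint16_t type, uint16_t value)
{
	struct mgmt_tlv *tlv = (void *)buf;

	tlv->type = type;		/* assumes a little-endian host */
	tlv->length = sizeof(uint16_t);
	memcpy(tlv->value, &value, sizeof(value));

	return sizeof(*tlv) + sizeof(value);
}

static size_t build_advmon_config(uint8_t *buf)
{
	size_t off = 0;

	off += add_u16_tlv(buf + off, 0x001d, 300);	/* allowlist duration, ms */
	off += add_u16_tlv(buf + off, 0x001e, 500);	/* no-filter duration, ms */

	/* 0x001f is a single byte enabling/disabling interleaving */
	buf[off++] = 0x1f;
	buf[off++] = 0x00;				/* type, little-endian */
	buf[off++] = sizeof(uint8_t);			/* length */
	buf[off++] = 1;					/* enable */

	return off;	/* buf/off form the SET_DEF_SYSTEM_CONFIG payload */
}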
diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c
index 8579bfeb2836..4b39534a14a1 100644
--- a/net/bluetooth/msft.c
+++ b/net/bluetooth/msft.c
@@ -12,12 +12,13 @@
struct msft_cp_read_supported_features {
__u8 sub_opcode;
} __packed;
+
struct msft_rp_read_supported_features {
__u8 status;
__u8 sub_opcode;
__le64 features;
__u8 evt_prefix_len;
- __u8 evt_prefix[0];
+ __u8 evt_prefix[];
} __packed;
struct msft_data {
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index dcf7f96ff417..22a110f37abc 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1001,6 +1001,17 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
err = -EFAULT;
break;
+ case BT_SNDMTU:
+ case BT_RCVMTU:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ if (put_user(sco_pi(sk)->conn->mtu, (u32 __user *)optval))
+ err = -EFAULT;
+ break;
+
default:
err = -ENOPROTOOPT;
break;
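With this change a connected SCO socket reports its usable packet size through the generic BT_SNDMTU/BT_RCVMTU options (both return conn->mtu as a u32). A minimal userspace sketch, assuming BlueZ's <bluetooth/bluetooth.h> for SOL_BLUETOOTH and the BT_*MTU constants:

#include <stdint.h>
#include <stdio.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>

/* Query the SCO MTU of an already-connected socket fd.  Fails with
 * ENOTCONN if the socket is not in BT_CONNECTED, as enforced above.
 */
static int sco_print_mtu(int fd)
{
	uint32_t mtu;
	socklen_t len = sizeof(mtu);

	if (getsockopt(fd, SOL_BLUETOOTH, BT_SNDMTU, &mtu, &len) < 0) {
		perror("getsockopt(BT_SNDMTU)");
		return -1;
	}

	printf("SCO send/recv MTU: %u bytes\n", mtu);
	return 0;
}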
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index bf4bef13d935..c659c464f7ca 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -3353,31 +3353,8 @@ static void smp_del_chan(struct l2cap_chan *chan)
l2cap_chan_put(chan);
}
-static ssize_t force_bredr_smp_read(struct file *file,
- char __user *user_buf,
- size_t count, loff_t *ppos)
+int smp_force_bredr(struct hci_dev *hdev, bool enable)
{
- struct hci_dev *hdev = file->private_data;
- char buf[3];
-
- buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP) ? 'Y': 'N';
- buf[1] = '\n';
- buf[2] = '\0';
- return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
-}
-
-static ssize_t force_bredr_smp_write(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- bool enable;
- int err;
-
- err = kstrtobool_from_user(user_buf, count, &enable);
- if (err)
- return err;
-
if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
return -EALREADY;
@@ -3399,16 +3376,9 @@ static ssize_t force_bredr_smp_write(struct file *file,
hci_dev_change_flag(hdev, HCI_FORCE_BREDR_SMP);
- return count;
+ return 0;
}
-static const struct file_operations force_bredr_smp_fops = {
- .open = simple_open,
- .read = force_bredr_smp_read,
- .write = force_bredr_smp_write,
- .llseek = default_llseek,
-};
-
int smp_register(struct hci_dev *hdev)
{
struct l2cap_chan *chan;
@@ -3433,17 +3403,7 @@ int smp_register(struct hci_dev *hdev)
hdev->smp_data = chan;
- /* If the controller does not support BR/EDR Secure Connections
- * feature, then the BR/EDR SMP channel shall not be present.
- *
- * To test this with Bluetooth 4.0 controllers, create a debugfs
- * switch that allows forcing BR/EDR SMP support and accepting
- * cross-transport pairing on non-AES encrypted connections.
- */
if (!lmp_sc_capable(hdev)) {
- debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
- hdev, &force_bredr_smp_fops);
-
/* Flag can be already set here (due to power toggle) */
if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
return 0;
diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h
index 121edadd5f8d..fc35a8bf358e 100644
--- a/net/bluetooth/smp.h
+++ b/net/bluetooth/smp.h
@@ -193,6 +193,8 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa);
int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]);
+int smp_force_bredr(struct hci_dev *hdev, bool enable);
+
int smp_register(struct hci_dev *hdev);
void smp_unregister(struct hci_dev *hdev);
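
With the debugfs file dropped from smp.c and smp_force_bredr() exported here, the force_bredr_smp switch is expected to live with the other hci debugfs entries (hci_debugfs.c also changes in this series). A rough sketch of what such a write handler reduces to - it simply mirrors the code removed above plus the new call, and is not the exact tree code:

/* Sketch: debugfs write path delegating to smp_force_bredr().
 * Assumes hdev is the file's private_data, as with other hci
 * debugfs entries.
 */
static ssize_t force_bredr_smp_write(struct file *file,
				     const char __user *user_buf,
				     size_t count, loff_t *ppos)
{
	struct hci_dev *hdev = file->private_data;
	bool enable;
	int err;

	err = kstrtobool_from_user(user_buf, count, &enable);
	if (err)
		return err;

	err = smp_force_bredr(hdev, enable);	/* -EALREADY if unchanged */
	if (err)
		return err;

	return count;
}
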
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index a66f211726e7..8b796c499cbb 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -11,6 +11,7 @@
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/error-injection.h>
+#include <linux/smp.h>
#define CREATE_TRACE_POINTS
#include <trace/events/bpf_test_run.h>
@@ -204,6 +205,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
int b = 2, err = -EFAULT;
u32 retval = 0;
+ if (kattr->test.flags || kattr->test.cpu)
+ return -EINVAL;
+
switch (prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
@@ -236,6 +240,85 @@ out:
return err;
}
+struct bpf_raw_tp_test_run_info {
+ struct bpf_prog *prog;
+ void *ctx;
+ u32 retval;
+};
+
+static void
+__bpf_prog_test_run_raw_tp(void *data)
+{
+ struct bpf_raw_tp_test_run_info *info = data;
+
+ rcu_read_lock();
+ info->retval = BPF_PROG_RUN(info->prog, info->ctx);
+ rcu_read_unlock();
+}
+
+int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+ __u32 ctx_size_in = kattr->test.ctx_size_in;
+ struct bpf_raw_tp_test_run_info info;
+ int cpu = kattr->test.cpu, err = 0;
+ int current_cpu;
+
+ /* doesn't support data_in/out, ctx_out, duration, or repeat */
+ if (kattr->test.data_in || kattr->test.data_out ||
+ kattr->test.ctx_out || kattr->test.duration ||
+ kattr->test.repeat)
+ return -EINVAL;
+
+ if (ctx_size_in < prog->aux->max_ctx_offset ||
+ ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64))
+ return -EINVAL;
+
+ if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0)
+ return -EINVAL;
+
+ if (ctx_size_in) {
+ info.ctx = kzalloc(ctx_size_in, GFP_USER);
+ if (!info.ctx)
+ return -ENOMEM;
+ if (copy_from_user(info.ctx, ctx_in, ctx_size_in)) {
+ err = -EFAULT;
+ goto out;
+ }
+ } else {
+ info.ctx = NULL;
+ }
+
+ info.prog = prog;
+
+ current_cpu = get_cpu();
+ if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 ||
+ cpu == current_cpu) {
+ __bpf_prog_test_run_raw_tp(&info);
+ } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+ /* smp_call_function_single() also checks cpu_online()
+ * after csd_lock(). However, since cpu is from user
+ * space, let's do an extra quick check to filter out
+ * invalid value before smp_call_function_single().
+ */
+ err = -ENXIO;
+ } else {
+ err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp,
+ &info, 1);
+ }
+ put_cpu();
+
+ if (!err &&
+ copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32)))
+ err = -EFAULT;
+
+out:
+ kfree(info.ctx);
+ return err;
+}
+
static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size)
{
void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in);
@@ -410,6 +493,9 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
void *data;
int ret;
+ if (kattr->test.flags || kattr->test.cpu)
+ return -EINVAL;
+
data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN,
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
if (IS_ERR(data))
@@ -607,6 +693,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
return -EINVAL;
+ if (kattr->test.flags || kattr->test.cpu)
+ return -EINVAL;
+
if (size < ETH_HLEN)
return -EINVAL;
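
The flags/cpu pair is only honoured by the new raw_tracepoint runner; the tracing, skb and flow-dissector runners above now reject any non-zero value. A hedged sketch of exercising it through the raw bpf(2) syscall, assuming a <linux/bpf.h> new enough to carry the test.flags/test.cpu fields and BPF_F_TEST_RUN_ON_CPU:

/* Sketch: run a raw_tp test program on a chosen CPU via BPF_PROG_TEST_RUN.
 * Only ctx_in/ctx_size_in, flags and cpu are meaningful for this runner;
 * data_in, data_out, ctx_out, duration and repeat must stay zero or the
 * kernel returns -EINVAL.
 */
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int test_run_raw_tp_on_cpu(int prog_fd, void *ctx, __u32 ctx_size,
				  __u32 cpu, __u32 *retval)
{
	union bpf_attr attr;
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.test.prog_fd = prog_fd;
	attr.test.ctx_in = (__u64)(unsigned long)ctx;
	attr.test.ctx_size_in = ctx_size;
	attr.test.flags = BPF_F_TEST_RUN_ON_CPU;
	attr.test.cpu = cpu;

	err = syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
	if (!err && retval)
		*retval = attr.test.retval;

	return err ? -errno : 0;
}
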
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
index 73d0b12789f1..8ad0233ce497 100644
--- a/net/bpfilter/Kconfig
+++ b/net/bpfilter/Kconfig
@@ -2,6 +2,7 @@
menuconfig BPFILTER
bool "BPF based packet filtering framework (BPFILTER)"
depends on NET && BPF && INET
+ select USERMODE_DRIVER
help
This builds experimental bpfilter framework that is aiming to
provide netfilter compatible functionality via BPF
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
index 80879196560c..3c8ded7d3e84 100644
--- a/net/bridge/Kconfig
+++ b/net/bridge/Kconfig
@@ -73,3 +73,14 @@ config BRIDGE_MRP
Say N to exclude this support and reduce the binary size.
If unsure, say N.
+
+config BRIDGE_CFM
+ bool "CFM protocol"
+ depends on BRIDGE
+ help
+ If you say Y here, then the Ethernet bridge will be able to run the
+ CFM protocol according to 802.1Q section 12.14.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say N.
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index ccb394236fbd..4702702a74d3 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -27,3 +27,5 @@ bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
obj-$(CONFIG_NETFILTER) += netfilter/
bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o
+
+bridge-$(CONFIG_BRIDGE_CFM) += br_cfm.o br_cfm_netlink.o
diff --git a/net/bridge/br.c b/net/bridge/br.c
index b6fe30e3768f..1b169f8e7491 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -43,7 +43,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
if (event == NETDEV_REGISTER) {
/* register of bridge completed, add sysfs entries */
- br_sysfs_addbr(dev);
+ err = br_sysfs_addbr(dev);
+ if (err)
+ return notifier_from_errno(err);
+
return NOTIFY_DONE;
}
}
@@ -183,6 +186,11 @@ static int br_switchdev_event(struct notifier_block *unused,
br_fdb_offloaded_set(br, p, fdb_info->addr,
fdb_info->vid, fdb_info->offloaded);
break;
+ case SWITCHDEV_FDB_FLUSH_TO_BRIDGE:
+ fdb_info = ptr;
+ /* Don't delete static entries */
+ br_fdb_delete_by_port(br, p, fdb_info->vid, 0);
+ break;
}
out:
diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c
new file mode 100644
index 000000000000..001064f7583d
--- /dev/null
+++ b/net/bridge/br_cfm.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/cfm_bridge.h>
+#include <uapi/linux/cfm_bridge.h>
+#include "br_private_cfm.h"
+
+static struct br_cfm_mep *br_mep_find(struct net_bridge *br, u32 instance)
+{
+ struct br_cfm_mep *mep;
+
+ hlist_for_each_entry(mep, &br->mep_list, head)
+ if (mep->instance == instance)
+ return mep;
+
+ return NULL;
+}
+
+static struct br_cfm_mep *br_mep_find_ifindex(struct net_bridge *br,
+ u32 ifindex)
+{
+ struct br_cfm_mep *mep;
+
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head,
+ lockdep_rtnl_is_held())
+ if (mep->create.ifindex == ifindex)
+ return mep;
+
+ return NULL;
+}
+
+static struct br_cfm_peer_mep *br_peer_mep_find(struct br_cfm_mep *mep,
+ u32 mepid)
+{
+ struct br_cfm_peer_mep *peer_mep;
+
+ hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head,
+ lockdep_rtnl_is_held())
+ if (peer_mep->mepid == mepid)
+ return peer_mep;
+
+ return NULL;
+}
+
+static struct net_bridge_port *br_mep_get_port(struct net_bridge *br,
+ u32 ifindex)
+{
+ struct net_bridge_port *port;
+
+ list_for_each_entry(port, &br->port_list, list)
+ if (port->dev->ifindex == ifindex)
+ return port;
+
+ return NULL;
+}
+
+/* Calculate the CCM interval in us. */
+static u32 interval_to_us(enum br_cfm_ccm_interval interval)
+{
+ switch (interval) {
+ case BR_CFM_CCM_INTERVAL_NONE:
+ return 0;
+ case BR_CFM_CCM_INTERVAL_3_3_MS:
+ return 3300;
+ case BR_CFM_CCM_INTERVAL_10_MS:
+ return 10 * 1000;
+ case BR_CFM_CCM_INTERVAL_100_MS:
+ return 100 * 1000;
+ case BR_CFM_CCM_INTERVAL_1_SEC:
+ return 1000 * 1000;
+ case BR_CFM_CCM_INTERVAL_10_SEC:
+ return 10 * 1000 * 1000;
+ case BR_CFM_CCM_INTERVAL_1_MIN:
+ return 60 * 1000 * 1000;
+ case BR_CFM_CCM_INTERVAL_10_MIN:
+ return 10 * 60 * 1000 * 1000;
+ }
+ return 0;
+}
+
+/* Convert the interface interval to CCM PDU value. */
+static u32 interval_to_pdu(enum br_cfm_ccm_interval interval)
+{
+ switch (interval) {
+ case BR_CFM_CCM_INTERVAL_NONE:
+ return 0;
+ case BR_CFM_CCM_INTERVAL_3_3_MS:
+ return 1;
+ case BR_CFM_CCM_INTERVAL_10_MS:
+ return 2;
+ case BR_CFM_CCM_INTERVAL_100_MS:
+ return 3;
+ case BR_CFM_CCM_INTERVAL_1_SEC:
+ return 4;
+ case BR_CFM_CCM_INTERVAL_10_SEC:
+ return 5;
+ case BR_CFM_CCM_INTERVAL_1_MIN:
+ return 6;
+ case BR_CFM_CCM_INTERVAL_10_MIN:
+ return 7;
+ }
+ return 0;
+}
+
+/* Convert the CCM PDU value to interval on interface. */
+static u32 pdu_to_interval(u32 value)
+{
+ switch (value) {
+ case 0:
+ return BR_CFM_CCM_INTERVAL_NONE;
+ case 1:
+ return BR_CFM_CCM_INTERVAL_3_3_MS;
+ case 2:
+ return BR_CFM_CCM_INTERVAL_10_MS;
+ case 3:
+ return BR_CFM_CCM_INTERVAL_100_MS;
+ case 4:
+ return BR_CFM_CCM_INTERVAL_1_SEC;
+ case 5:
+ return BR_CFM_CCM_INTERVAL_10_SEC;
+ case 6:
+ return BR_CFM_CCM_INTERVAL_1_MIN;
+ case 7:
+ return BR_CFM_CCM_INTERVAL_10_MIN;
+ }
+ return BR_CFM_CCM_INTERVAL_NONE;
+}
+
+static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep)
+{
+ u32 interval_us;
+
+ interval_us = interval_to_us(peer_mep->mep->cc_config.exp_interval);
+ /* Function ccm_rx_dwork must be called with 1/4
+ * of the configured CC 'expected_interval'
+ * in order to detect a CCM defect after 3.25 intervals.
+ */
+ queue_delayed_work(system_wq, &peer_mep->ccm_rx_dwork,
+ usecs_to_jiffies(interval_us / 4));
+}
+
+static void br_cfm_notify(int event, const struct net_bridge_port *port)
+{
+ u32 filter = RTEXT_FILTER_CFM_STATUS;
+
+ return br_info_notify(event, port->br, NULL, filter);
+}
+
+static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep)
+{
+ memset(&peer_mep->cc_status, 0, sizeof(peer_mep->cc_status));
+ peer_mep->ccm_rx_count_miss = 0;
+
+ ccm_rx_timer_start(peer_mep);
+}
+
+static void cc_peer_disable(struct br_cfm_peer_mep *peer_mep)
+{
+ cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork);
+}
+
+static struct sk_buff *ccm_frame_build(struct br_cfm_mep *mep,
+ const struct br_cfm_cc_ccm_tx_info *const tx_info)
+
+{
+ struct br_cfm_common_hdr *common_hdr;
+ struct net_bridge_port *b_port;
+ struct br_cfm_maid *maid;
+ u8 *itu_reserved, *e_tlv;
+ struct ethhdr *eth_hdr;
+ struct sk_buff *skb;
+ __be32 *status_tlv;
+ __be32 *snumber;
+ __be16 *mepid;
+
+ skb = dev_alloc_skb(CFM_CCM_MAX_FRAME_LENGTH);
+ if (!skb)
+ return NULL;
+
+ rcu_read_lock();
+ b_port = rcu_dereference(mep->b_port);
+ if (!b_port) {
+ kfree_skb(skb);
+ rcu_read_unlock();
+ return NULL;
+ }
+ skb->dev = b_port->dev;
+ rcu_read_unlock();
+ /* The device cannot be deleted until the work_queue functions have
+ * completed. This function is called from ccm_tx_work_expired(),
+ * which is a work_queue function.
+ */
+
+ skb->protocol = htons(ETH_P_CFM);
+ skb->priority = CFM_FRAME_PRIO;
+
+ /* Ethernet header */
+ eth_hdr = skb_put(skb, sizeof(*eth_hdr));
+ ether_addr_copy(eth_hdr->h_dest, tx_info->dmac.addr);
+ ether_addr_copy(eth_hdr->h_source, mep->config.unicast_mac.addr);
+ eth_hdr->h_proto = htons(ETH_P_CFM);
+
+ /* Common CFM Header */
+ common_hdr = skb_put(skb, sizeof(*common_hdr));
+ common_hdr->mdlevel_version = mep->config.mdlevel << 5;
+ common_hdr->opcode = BR_CFM_OPCODE_CCM;
+ common_hdr->flags = (mep->rdi << 7) |
+ interval_to_pdu(mep->cc_config.exp_interval);
+ common_hdr->tlv_offset = CFM_CCM_TLV_OFFSET;
+
+ /* Sequence number */
+ snumber = skb_put(skb, sizeof(*snumber));
+ if (tx_info->seq_no_update) {
+ *snumber = cpu_to_be32(mep->ccm_tx_snumber);
+ mep->ccm_tx_snumber += 1;
+ } else {
+ *snumber = 0;
+ }
+
+ mepid = skb_put(skb, sizeof(*mepid));
+ *mepid = cpu_to_be16((u16)mep->config.mepid);
+
+ maid = skb_put(skb, sizeof(*maid));
+ memcpy(maid->data, mep->cc_config.exp_maid.data, sizeof(maid->data));
+
+ /* ITU reserved (CFM_CCM_ITU_RESERVED_SIZE octets) */
+ itu_reserved = skb_put(skb, CFM_CCM_ITU_RESERVED_SIZE);
+ memset(itu_reserved, 0, CFM_CCM_ITU_RESERVED_SIZE);
+
+ /* General CFM TLV format:
+ * TLV type: one byte
+ * TLV value length: two bytes
+ * TLV value: 'TLV value length' bytes
+ */
+
+ /* Port status TLV. The value length is 1. Total of 4 bytes. */
+ if (tx_info->port_tlv) {
+ status_tlv = skb_put(skb, sizeof(*status_tlv));
+ *status_tlv = cpu_to_be32((CFM_PORT_STATUS_TLV_TYPE << 24) |
+ (1 << 8) | /* Value length */
+ (tx_info->port_tlv_value & 0xFF));
+ }
+
+ /* Interface status TLV. The value length is 1. Total of 4 bytes. */
+ if (tx_info->if_tlv) {
+ status_tlv = skb_put(skb, sizeof(*status_tlv));
+ *status_tlv = cpu_to_be32((CFM_IF_STATUS_TLV_TYPE << 24) |
+ (1 << 8) | /* Value length */
+ (tx_info->if_tlv_value & 0xFF));
+ }
+
+ /* End TLV */
+ e_tlv = skb_put(skb, sizeof(*e_tlv));
+ *e_tlv = CFM_ENDE_TLV_TYPE;
+
+ return skb;
+}
+
+static void ccm_frame_tx(struct sk_buff *skb)
+{
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+}
+
+/* This function is called with the configured CC 'expected_interval'
+ * in order to drive CCM transmission when enabled.
+ */
+static void ccm_tx_work_expired(struct work_struct *work)
+{
+ struct delayed_work *del_work;
+ struct br_cfm_mep *mep;
+ struct sk_buff *skb;
+ u32 interval_us;
+
+ del_work = to_delayed_work(work);
+ mep = container_of(del_work, struct br_cfm_mep, ccm_tx_dwork);
+
+ if (time_before_eq(mep->ccm_tx_end, jiffies)) {
+ /* Transmission period has ended */
+ mep->cc_ccm_tx_info.period = 0;
+ return;
+ }
+
+ skb = ccm_frame_build(mep, &mep->cc_ccm_tx_info);
+ if (skb)
+ ccm_frame_tx(skb);
+
+ interval_us = interval_to_us(mep->cc_config.exp_interval);
+ queue_delayed_work(system_wq, &mep->ccm_tx_dwork,
+ usecs_to_jiffies(interval_us));
+}
+
+/* This function is called with 1/4 of the configured CC 'expected_interval'
+ * in order to detect a CCM defect after 3.25 intervals.
+ */
+static void ccm_rx_work_expired(struct work_struct *work)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct net_bridge_port *b_port;
+ struct delayed_work *del_work;
+
+ del_work = to_delayed_work(work);
+ peer_mep = container_of(del_work, struct br_cfm_peer_mep, ccm_rx_dwork);
+
+ /* After 13 counts (4 * 3.25), 3.25 intervals have expired */
+ if (peer_mep->ccm_rx_count_miss < 13) {
+ /* 3.25 intervals have NOT expired without CCM reception */
+ peer_mep->ccm_rx_count_miss++;
+
+ /* Start timer again */
+ ccm_rx_timer_start(peer_mep);
+ } else {
+ /* 3.25 intervals have expired without CCM reception.
+ * CCM defect detected
+ */
+ peer_mep->cc_status.ccm_defect = true;
+
+ /* Change in CCM defect status - notify */
+ rcu_read_lock();
+ b_port = rcu_dereference(peer_mep->mep->b_port);
+ if (b_port)
+ br_cfm_notify(RTM_NEWLINK, b_port);
+ rcu_read_unlock();
+ }
+}
+
+static u32 ccm_tlv_extract(struct sk_buff *skb, u32 index,
+ struct br_cfm_peer_mep *peer_mep)
+{
+ __be32 *s_tlv;
+ __be32 _s_tlv;
+ u32 h_s_tlv;
+ u8 *e_tlv;
+ u8 _e_tlv;
+
+ e_tlv = skb_header_pointer(skb, index, sizeof(_e_tlv), &_e_tlv);
+ if (!e_tlv)
+ return 0;
+
+ /* TLV is present - get the status TLV */
+ s_tlv = skb_header_pointer(skb,
+ index,
+ sizeof(_s_tlv), &_s_tlv);
+ if (!s_tlv)
+ return 0;
+
+ h_s_tlv = ntohl(*s_tlv);
+ if ((h_s_tlv >> 24) == CFM_IF_STATUS_TLV_TYPE) {
+ /* Interface status TLV */
+ peer_mep->cc_status.tlv_seen = true;
+ peer_mep->cc_status.if_tlv_value = (h_s_tlv & 0xFF);
+ }
+
+ if ((h_s_tlv >> 24) == CFM_PORT_STATUS_TLV_TYPE) {
+ /* Port status TLV */
+ peer_mep->cc_status.tlv_seen = true;
+ peer_mep->cc_status.port_tlv_value = (h_s_tlv & 0xFF);
+ }
+
+ /* The Sender ID TLV is not handled */
+ /* The Organization-Specific TLV is not handled */
+
+ /* Return the length of this TLV.
+ * This is the length of the value field plus 3 bytes for the size of
+ * the type and length fields.
+ */
+ return ((h_s_tlv >> 8) & 0xFFFF) + 3;
+}
+
+/* note: already called with rcu_read_lock */
+static int br_cfm_frame_rx(struct net_bridge_port *port, struct sk_buff *skb)
+{
+ u32 mdlevel, interval, size, index, max;
+ const struct br_cfm_common_hdr *hdr;
+ struct br_cfm_peer_mep *peer_mep;
+ const struct br_cfm_maid *maid;
+ struct br_cfm_common_hdr _hdr;
+ struct br_cfm_maid _maid;
+ struct br_cfm_mep *mep;
+ struct net_bridge *br;
+ __be32 *snumber;
+ __be32 _snumber;
+ __be16 *mepid;
+ __be16 _mepid;
+
+ if (port->state == BR_STATE_DISABLED)
+ return 0;
+
+ hdr = skb_header_pointer(skb, 0, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return 1;
+
+ br = port->br;
+ mep = br_mep_find_ifindex(br, port->dev->ifindex);
+ if (unlikely(!mep))
+ /* No MEP on this port - must be forwarded */
+ return 0;
+
+ mdlevel = hdr->mdlevel_version >> 5;
+ if (mdlevel > mep->config.mdlevel)
+ /* The level is above this MEP level - must be forwarded */
+ return 0;
+
+ if ((hdr->mdlevel_version & 0x1F) != 0) {
+ /* Invalid version */
+ mep->status.version_unexp_seen = true;
+ return 1;
+ }
+
+ if (mdlevel < mep->config.mdlevel) {
+ /* The level is below this MEP level */
+ mep->status.rx_level_low_seen = true;
+ return 1;
+ }
+
+ if (hdr->opcode == BR_CFM_OPCODE_CCM) {
+ /* CCM PDU received. */
+ /* MA ID is after common header + sequence number + MEP ID */
+ maid = skb_header_pointer(skb,
+ CFM_CCM_PDU_MAID_OFFSET,
+ sizeof(_maid), &_maid);
+ if (!maid)
+ return 1;
+ if (memcmp(maid->data, mep->cc_config.exp_maid.data,
+ sizeof(maid->data)))
+ /* MA ID not as expected */
+ return 1;
+
+ /* MEP ID is after common header + sequence number */
+ mepid = skb_header_pointer(skb,
+ CFM_CCM_PDU_MEPID_OFFSET,
+ sizeof(_mepid), &_mepid);
+ if (!mepid)
+ return 1;
+ peer_mep = br_peer_mep_find(mep, (u32)ntohs(*mepid));
+ if (!peer_mep)
+ return 1;
+
+ /* Interval is in common header flags */
+ interval = hdr->flags & 0x07;
+ if (mep->cc_config.exp_interval != pdu_to_interval(interval))
+ /* Interval not as expected */
+ return 1;
+
+ /* A valid CCM frame is received */
+ if (peer_mep->cc_status.ccm_defect) {
+ peer_mep->cc_status.ccm_defect = false;
+
+ /* Change in CCM defect status - notify */
+ br_cfm_notify(RTM_NEWLINK, port);
+
+ /* Start CCM RX timer */
+ ccm_rx_timer_start(peer_mep);
+ }
+
+ peer_mep->cc_status.seen = true;
+ peer_mep->ccm_rx_count_miss = 0;
+
+ /* RDI is in common header flags */
+ peer_mep->cc_status.rdi = (hdr->flags & 0x80) ? true : false;
+
+ /* Sequence number is after common header */
+ snumber = skb_header_pointer(skb,
+ CFM_CCM_PDU_SEQNR_OFFSET,
+ sizeof(_snumber), &_snumber);
+ if (!snumber)
+ return 1;
+ if (ntohl(*snumber) != (mep->ccm_rx_snumber + 1))
+ /* Unexpected sequence number */
+ peer_mep->cc_status.seq_unexp_seen = true;
+
+ mep->ccm_rx_snumber = ntohl(*snumber);
+
+ /* TLV end is after common header + sequence number + MEP ID +
+ * MA ID + ITU reserved
+ */
+ index = CFM_CCM_PDU_TLV_OFFSET;
+ max = 0;
+ do { /* Handle all TLVs */
+ size = ccm_tlv_extract(skb, index, peer_mep);
+ index += size;
+ max += 1;
+ } while (size != 0 && max < 4); /* Max four TLVs possible */
+
+ return 1;
+ }
+
+ mep->status.opcode_unexp_seen = true;
+
+ return 1;
+}
+
+static struct br_frame_type cfm_frame_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CFM),
+ .frame_handler = br_cfm_frame_rx,
+};
+
+int br_cfm_mep_create(struct net_bridge *br,
+ const u32 instance,
+ struct br_cfm_mep_create *const create,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port *p;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ if (create->domain == BR_CFM_VLAN) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "VLAN domain not supported");
+ return -EINVAL;
+ }
+ if (create->domain != BR_CFM_PORT) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid domain value");
+ return -EINVAL;
+ }
+ if (create->direction == BR_CFM_MEP_DIRECTION_UP) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Up-MEP not supported");
+ return -EINVAL;
+ }
+ if (create->direction != BR_CFM_MEP_DIRECTION_DOWN) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid direction value");
+ return -EINVAL;
+ }
+ p = br_mep_get_port(br, create->ifindex);
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Port is not related to bridge");
+ return -EINVAL;
+ }
+ mep = br_mep_find(br, instance);
+ if (mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance already exists");
+ return -EEXIST;
+ }
+
+ /* In PORT domain only one instance can be created per port */
+ if (create->domain == BR_CFM_PORT) {
+ mep = br_mep_find_ifindex(br, create->ifindex);
+ if (mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only one Port MEP on a port allowed");
+ return -EINVAL;
+ }
+ }
+
+ mep = kzalloc(sizeof(*mep), GFP_KERNEL);
+ if (!mep)
+ return -ENOMEM;
+
+ mep->create = *create;
+ mep->instance = instance;
+ rcu_assign_pointer(mep->b_port, p);
+
+ INIT_HLIST_HEAD(&mep->peer_mep_list);
+ INIT_DELAYED_WORK(&mep->ccm_tx_dwork, ccm_tx_work_expired);
+
+ if (hlist_empty(&br->mep_list))
+ br_add_frame(br, &cfm_frame_type);
+
+ hlist_add_tail_rcu(&mep->head, &br->mep_list);
+
+ return 0;
+}
+
+static void mep_delete_implementation(struct net_bridge *br,
+ struct br_cfm_mep *mep)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct hlist_node *n_store;
+
+ ASSERT_RTNL();
+
+ /* Empty and free peer MEP list */
+ hlist_for_each_entry_safe(peer_mep, n_store, &mep->peer_mep_list, head) {
+ cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork);
+ hlist_del_rcu(&peer_mep->head);
+ kfree_rcu(peer_mep, rcu);
+ }
+
+ cancel_delayed_work_sync(&mep->ccm_tx_dwork);
+
+ RCU_INIT_POINTER(mep->b_port, NULL);
+ hlist_del_rcu(&mep->head);
+ kfree_rcu(mep, rcu);
+
+ if (hlist_empty(&br->mep_list))
+ br_del_frame(br, &cfm_frame_type);
+}
+
+int br_cfm_mep_delete(struct net_bridge *br,
+ const u32 instance,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ mep_delete_implementation(br, mep);
+
+ return 0;
+}
+
+int br_cfm_mep_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_mep_config *const config,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ mep->config = *config;
+
+ return 0;
+}
+
+int br_cfm_cc_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_cc_config *const config,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ /* Check for no change in configuration */
+ if (memcmp(config, &mep->cc_config, sizeof(*config)) == 0)
+ return 0;
+
+ if (config->enable && !mep->cc_config.enable)
+ /* CC is enabled */
+ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head)
+ cc_peer_enable(peer_mep);
+
+ if (!config->enable && mep->cc_config.enable)
+ /* CC is disabled */
+ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head)
+ cc_peer_disable(peer_mep);
+
+ mep->cc_config = *config;
+ mep->ccm_rx_snumber = 0;
+ mep->ccm_tx_snumber = 1;
+
+ return 0;
+}
+
+int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance,
+ u32 mepid,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ peer_mep = br_peer_mep_find(mep, mepid);
+ if (peer_mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Peer MEP-ID already exists");
+ return -EEXIST;
+ }
+
+ peer_mep = kzalloc(sizeof(*peer_mep), GFP_KERNEL);
+ if (!peer_mep)
+ return -ENOMEM;
+
+ peer_mep->mepid = mepid;
+ peer_mep->mep = mep;
+ INIT_DELAYED_WORK(&peer_mep->ccm_rx_dwork, ccm_rx_work_expired);
+
+ if (mep->cc_config.enable)
+ cc_peer_enable(peer_mep);
+
+ hlist_add_tail_rcu(&peer_mep->head, &mep->peer_mep_list);
+
+ return 0;
+}
+
+int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance,
+ u32 mepid,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ peer_mep = br_peer_mep_find(mep, mepid);
+ if (!peer_mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Peer MEP-ID does not exists");
+ return -ENOENT;
+ }
+
+ cc_peer_disable(peer_mep);
+
+ hlist_del_rcu(&peer_mep->head);
+ kfree_rcu(peer_mep, rcu);
+
+ return 0;
+}
+
+int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance,
+ const bool rdi, struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ mep->rdi = rdi;
+
+ return 0;
+}
+
+int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance,
+ const struct br_cfm_cc_ccm_tx_info *const tx_info,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ if (memcmp(tx_info, &mep->cc_ccm_tx_info, sizeof(*tx_info)) == 0) {
+ /* No change in tx_info. */
+ if (mep->cc_ccm_tx_info.period == 0)
+ /* Transmission is not enabled - just return */
+ return 0;
+
+ /* Transmission is ongoing, the end time is recalculated */
+ mep->ccm_tx_end = jiffies +
+ usecs_to_jiffies(tx_info->period * 1000000);
+ return 0;
+ }
+
+ if (tx_info->period == 0 && mep->cc_ccm_tx_info.period == 0)
+ /* Some change in info and transmission is not ongoing */
+ goto save;
+
+ if (tx_info->period != 0 && mep->cc_ccm_tx_info.period != 0) {
+ /* Some change in info and transmission is ongoing
+ * The end time is recalculated
+ */
+ mep->ccm_tx_end = jiffies +
+ usecs_to_jiffies(tx_info->period * 1000000);
+
+ goto save;
+ }
+
+ if (tx_info->period == 0 && mep->cc_ccm_tx_info.period != 0) {
+ cancel_delayed_work_sync(&mep->ccm_tx_dwork);
+ goto save;
+ }
+
+ /* Start delayed work to transmit CCM frames. It is done with zero delay
+ * to send the first frame immediately
+ */
+ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000);
+ queue_delayed_work(system_wq, &mep->ccm_tx_dwork, 0);
+
+save:
+ mep->cc_ccm_tx_info = *tx_info;
+
+ return 0;
+}
+
+int br_cfm_mep_count(struct net_bridge *br, u32 *count)
+{
+ struct br_cfm_mep *mep;
+
+ *count = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head)
+ *count += 1;
+ rcu_read_unlock();
+
+ return 0;
+}
+
+int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ *count = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head)
+ hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head)
+ *count += 1;
+ rcu_read_unlock();
+
+ return 0;
+}
+
+bool br_cfm_created(struct net_bridge *br)
+{
+ return !hlist_empty(&br->mep_list);
+}
+
+/* Deletes the CFM instances on a specific bridge port
+ */
+void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *port)
+{
+ struct hlist_node *n_store;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ hlist_for_each_entry_safe(mep, n_store, &br->mep_list, head)
+ if (mep->create.ifindex == port->dev->ifindex)
+ mep_delete_implementation(br, mep);
+}
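
The loss-of-continuity behaviour above comes from two constants: the RX work is scheduled at 1/4 of the expected CCM interval (ccm_rx_timer_start) and a defect is raised after 13 consecutive misses (ccm_rx_work_expired), i.e. 13/4 = 3.25 intervals. A standalone arithmetic sketch, independent of the kernel types, just to make the timing explicit:

/* Sketch: time from the last received CCM to the defect indication,
 * given the 1/4-interval polling and the 13-miss threshold above.
 */
#include <stdio.h>

int main(void)
{
	unsigned int interval_us = 1000 * 1000;	/* 1 s CCM interval */
	unsigned int poll_us = interval_us / 4;	/* as in ccm_rx_timer_start() */
	unsigned int misses = 13;		/* as in ccm_rx_work_expired() */

	printf("defect after ~%u us (%.2f intervals)\n",
	       poll_us * misses,
	       (double)(poll_us * misses) / interval_us);
	return 0;
}
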
diff --git a/net/bridge/br_cfm_netlink.c b/net/bridge/br_cfm_netlink.c
new file mode 100644
index 000000000000..5c4c369f8536
--- /dev/null
+++ b/net/bridge/br_cfm_netlink.c
@@ -0,0 +1,726 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/genetlink.h>
+
+#include "br_private.h"
+#include "br_private_cfm.h"
+
+static const struct nla_policy
+br_cfm_mep_create_policy[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_MEP_CREATE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+br_cfm_mep_delete_policy[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_MEP_DELETE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+br_cfm_mep_config_policy[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC] = NLA_POLICY_ETH_ADDR,
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL] = NLA_POLICY_MAX(NLA_U32, 7),
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF),
+};
+
+static const struct nla_policy
+br_cfm_cc_config_policy[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_CONFIG_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID] = {
+ .type = NLA_BINARY, .len = CFM_MAID_LENGTH },
+};
+
+static const struct nla_policy
+br_cfm_cc_peer_mep_policy[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_PEER_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF),
+};
+
+static const struct nla_policy
+br_cfm_cc_rdi_policy[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_RDI_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_RDI_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_RDI_RDI] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+br_cfm_cc_ccm_tx_policy[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC] = NLA_POLICY_ETH_ADDR,
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE] = { .type = NLA_U8 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy
+br_cfm_policy[IFLA_BRIDGE_CFM_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_CREATE] =
+ NLA_POLICY_NESTED(br_cfm_mep_create_policy),
+ [IFLA_BRIDGE_CFM_MEP_DELETE] =
+ NLA_POLICY_NESTED(br_cfm_mep_delete_policy),
+ [IFLA_BRIDGE_CFM_MEP_CONFIG] =
+ NLA_POLICY_NESTED(br_cfm_mep_config_policy),
+ [IFLA_BRIDGE_CFM_CC_CONFIG] =
+ NLA_POLICY_NESTED(br_cfm_cc_config_policy),
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] =
+ NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy),
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] =
+ NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy),
+ [IFLA_BRIDGE_CFM_CC_RDI] =
+ NLA_POLICY_NESTED(br_cfm_cc_rdi_policy),
+ [IFLA_BRIDGE_CFM_CC_CCM_TX] =
+ NLA_POLICY_NESTED(br_cfm_cc_ccm_tx_policy),
+};
+
+static int br_mep_create_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1];
+ struct br_cfm_mep_create create;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CREATE_MAX, attr,
+ br_cfm_mep_create_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing DOMAIN attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing DIRECTION attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing IFINDEX attribute");
+ return -EINVAL;
+ }
+
+ memset(&create, 0, sizeof(create));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]);
+ create.domain = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]);
+ create.direction = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]);
+ create.ifindex = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]);
+
+ return br_cfm_mep_create(br, instance, &create, extack);
+}
+
+static int br_mep_delete_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1];
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_DELETE_MAX, attr,
+ br_cfm_mep_delete_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]);
+
+ return br_cfm_mep_delete(br, instance, extack);
+}
+
+static int br_mep_config_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1];
+ struct br_cfm_mep_config config;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CONFIG_MAX, attr,
+ br_cfm_mep_config_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing UNICAST_MAC attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MDLEVEL attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MEPID attribute");
+ return -EINVAL;
+ }
+
+ memset(&config, 0, sizeof(config));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]);
+ nla_memcpy(&config.unicast_mac.addr,
+ tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC],
+ sizeof(config.unicast_mac.addr));
+ config.mdlevel = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]);
+ config.mepid = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]);
+
+ return br_cfm_mep_config_set(br, instance, &config, extack);
+}
+
+static int br_cc_config_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1];
+ struct br_cfm_cc_config config;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CONFIG_MAX, attr,
+ br_cfm_cc_config_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing ENABLE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INTERVAL attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MAID attribute");
+ return -EINVAL;
+ }
+
+ memset(&config, 0, sizeof(config));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]);
+ config.enable = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]);
+ config.exp_interval = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]);
+ nla_memcpy(&config.exp_maid.data, tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID],
+ sizeof(config.exp_maid.data));
+
+ return br_cfm_cc_config_set(br, instance, &config, extack);
+}
+
+static int br_cc_peer_mep_add_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1];
+ u32 instance, peer_mep_id;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr,
+ br_cfm_cc_peer_mep_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]);
+ peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]);
+
+ return br_cfm_cc_peer_mep_add(br, instance, peer_mep_id, extack);
+}
+
+static int br_cc_peer_mep_remove_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1];
+ u32 instance, peer_mep_id;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr,
+ br_cfm_cc_peer_mep_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]);
+ peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]);
+
+ return br_cfm_cc_peer_mep_remove(br, instance, peer_mep_id, extack);
+}
+
+static int br_cc_rdi_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1];
+ u32 instance, rdi;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_RDI_MAX, attr,
+ br_cfm_cc_rdi_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing RDI attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]);
+ rdi = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]);
+
+ return br_cfm_cc_rdi_set(br, instance, rdi, extack);
+}
+
+static int br_cc_ccm_tx_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1];
+ struct br_cfm_cc_ccm_tx_info tx_info;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CCM_TX_MAX, attr,
+ br_cfm_cc_ccm_tx_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing DMAC attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing SEQ_NO_UPDATE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PERIOD attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV_VALUE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV_VALUE attribute");
+ return -EINVAL;
+ }
+
+ memset(&tx_info, 0, sizeof(tx_info));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]);
+ nla_memcpy(&tx_info.dmac.addr,
+ tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC],
+ sizeof(tx_info.dmac.addr));
+ tx_info.seq_no_update = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]);
+ tx_info.period = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]);
+ tx_info.if_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]);
+ tx_info.if_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]);
+ tx_info.port_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]);
+ tx_info.port_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]);
+
+ return br_cfm_cc_ccm_tx(br, instance, &tx_info, extack);
+}
+
+int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MAX + 1];
+ int err;
+
+ /* When this function is called for a port, the br pointer is
+ * invalid, therefore set br to point to the correct bridge
+ */
+ if (p)
+ br = p->br;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MAX, attr,
+ br_cfm_policy, extack);
+ if (err)
+ return err;
+
+ if (tb[IFLA_BRIDGE_CFM_MEP_CREATE]) {
+ err = br_mep_create_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CREATE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_MEP_DELETE]) {
+ err = br_mep_delete_parse(br, tb[IFLA_BRIDGE_CFM_MEP_DELETE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_MEP_CONFIG]) {
+ err = br_mep_config_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CONFIG],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_CONFIG]) {
+ err = br_cc_config_parse(br, tb[IFLA_BRIDGE_CFM_CC_CONFIG],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD]) {
+ err = br_cc_peer_mep_add_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE]) {
+ err = br_cc_peer_mep_remove_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_RDI]) {
+ err = br_cc_rdi_parse(br, tb[IFLA_BRIDGE_CFM_CC_RDI],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_CCM_TX]) {
+ err = br_cc_ccm_tx_parse(br, tb[IFLA_BRIDGE_CFM_CC_CCM_TX],
+ extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+ struct nlattr *tb;
+
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head) {
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INFO);
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN,
+ mep->create.domain))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION,
+ mep->create.direction))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX,
+ mep->create.ifindex))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC,
+ sizeof(mep->config.unicast_mac.addr),
+ mep->config.unicast_mac.addr))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL,
+ mep->config.mdlevel))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID,
+ mep->config.mepid))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE,
+ mep->cc_config.enable))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL,
+ mep->cc_config.exp_interval))
+ goto nla_put_failure;
+
+ if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID,
+ sizeof(mep->cc_config.exp_maid.data),
+ mep->cc_config.exp_maid.data))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_RDI_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_RDI,
+ mep->rdi))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC,
+ sizeof(mep->cc_ccm_tx_info.dmac),
+ mep->cc_ccm_tx_info.dmac.addr))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE,
+ mep->cc_ccm_tx_info.seq_no_update))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD,
+ mep->cc_ccm_tx_info.period))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV,
+ mep->cc_ccm_tx_info.if_tlv))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE,
+ mep->cc_ccm_tx_info.if_tlv_value))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV,
+ mep->cc_ccm_tx_info.port_tlv))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE,
+ mep->cc_ccm_tx_info.port_tlv_value))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) {
+ tb = nla_nest_start(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEPID,
+ peer_mep->mepid))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+ }
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, tb);
+
+nla_info_failure:
+ return -EMSGSIZE;
+}
+
+int br_cfm_status_fill_info(struct sk_buff *skb,
+ struct net_bridge *br,
+ bool getlink)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+ struct nlattr *tb;
+
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head) {
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INFO);
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN,
+ mep->status.opcode_unexp_seen))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN,
+ mep->status.version_unexp_seen))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN,
+ mep->status.rx_level_low_seen))
+ goto nla_put_failure;
+
+ /* Only clear if this is a GETLINK */
+ if (getlink) {
+ /* Clear all 'seen' indications */
+ mep->status.opcode_unexp_seen = false;
+ mep->status.version_unexp_seen = false;
+ mep->status.rx_level_low_seen = false;
+ }
+
+ nla_nest_end(skb, tb);
+
+ hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) {
+ tb = nla_nest_start(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO);
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID,
+ peer_mep->mepid))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT,
+ peer_mep->cc_status.ccm_defect))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI,
+ peer_mep->cc_status.rdi))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE,
+ peer_mep->cc_status.port_tlv_value))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE,
+ peer_mep->cc_status.if_tlv_value))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN,
+ peer_mep->cc_status.seen))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN,
+ peer_mep->cc_status.tlv_seen))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN,
+ peer_mep->cc_status.seq_unexp_seen))
+ goto nla_put_failure;
+
+ if (getlink) { /* Only clear if this is a GETLINK */
+ /* Clear all 'seen' indications */
+ peer_mep->cc_status.seen = false;
+ peer_mep->cc_status.tlv_seen = false;
+ peer_mep->cc_status.seq_unexp_seen = false;
+ }
+
+ nla_nest_end(skb, tb);
+ }
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, tb);
+
+nla_info_failure:
+ return -EMSGSIZE;
+}
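
Userspace reaches these parse functions through the bridge RTM_SETLINK path, nesting the CFM attributes exactly as the policies above describe. A rough libmnl fragment for IFLA_BRIDGE_CFM_MEP_CREATE follows; it assumes the CFM nest sits inside an already prepared RTM_SETLINK message carrying the bridge ifindex and an open IFLA_AF_SPEC nest, and that the new UAPI constants are exported via <linux/if_bridge.h> and <linux/cfm_bridge.h> - all of which lives outside this file and is therefore an assumption here.

/* Sketch: nest a MEP_CREATE request under IFLA_BRIDGE_CFM with libmnl.
 * "nlh" is assumed to already carry the RTM_SETLINK header and an open
 * IFLA_AF_SPEC nest; only the CFM part is shown.
 */
#include <stdint.h>
#include <libmnl/libmnl.h>
#include <linux/if_bridge.h>
#include <linux/cfm_bridge.h>

static void put_cfm_mep_create(struct nlmsghdr *nlh, uint32_t instance,
			       uint32_t port_ifindex)
{
	struct nlattr *cfm, *create;

	cfm = mnl_attr_nest_start(nlh, IFLA_BRIDGE_CFM);
	create = mnl_attr_nest_start(nlh, IFLA_BRIDGE_CFM_MEP_CREATE);

	mnl_attr_put_u32(nlh, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE, instance);
	mnl_attr_put_u32(nlh, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN, BR_CFM_PORT);
	mnl_attr_put_u32(nlh, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION,
			 BR_CFM_MEP_DIRECTION_DOWN);
	mnl_attr_put_u32(nlh, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX, port_ifindex);

	mnl_attr_nest_end(nlh, create);
	mnl_attr_nest_end(nlh, cfm);
}
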
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 9a2fb4aa1a10..3f2f06b4dd27 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -30,7 +30,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_fdb_entry *dst;
struct net_bridge_mdb_entry *mdst;
- struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
const struct nf_br_ops *nf_ops;
u8 state = BR_STATE_FORWARDING;
const unsigned char *dest;
@@ -45,10 +44,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
- u64_stats_update_begin(&brstats->syncp);
- brstats->tx_packets++;
- brstats->tx_bytes += skb->len;
- u64_stats_update_end(&brstats->syncp);
+ dev_sw_netstats_tx_add(dev, 1, skb->len);
br_switchdev_frame_unmark(skb);
BR_INPUT_SKB_CB(skb)->brdev = dev;
@@ -93,7 +89,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
- br_multicast_querier_exists(br, eth_hdr(skb)))
+ br_multicast_querier_exists(br, eth_hdr(skb), mdst))
br_multicast_flood(mdst, skb, false, true);
else
br_flood(br, skb, BR_PKT_MULTICAST, false, true);
@@ -119,26 +115,26 @@ static int br_dev_init(struct net_device *dev)
struct net_bridge *br = netdev_priv(dev);
int err;
- br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!br->stats)
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
return -ENOMEM;
err = br_fdb_hash_init(br);
if (err) {
- free_percpu(br->stats);
+ free_percpu(dev->tstats);
return err;
}
err = br_mdb_hash_init(br);
if (err) {
- free_percpu(br->stats);
+ free_percpu(dev->tstats);
br_fdb_hash_fini(br);
return err;
}
err = br_vlan_init(br);
if (err) {
- free_percpu(br->stats);
+ free_percpu(dev->tstats);
br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
return err;
@@ -146,7 +142,7 @@ static int br_dev_init(struct net_device *dev)
err = br_multicast_init_stats(br);
if (err) {
- free_percpu(br->stats);
+ free_percpu(dev->tstats);
br_vlan_flush(br);
br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
@@ -165,7 +161,7 @@ static void br_dev_uninit(struct net_device *dev)
br_vlan_flush(br);
br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
- free_percpu(br->stats);
+ free_percpu(dev->tstats);
}
static int br_dev_open(struct net_device *dev)
@@ -177,6 +173,9 @@ static int br_dev_open(struct net_device *dev)
br_stp_enable_bridge(br);
br_multicast_open(br);
+ if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ br_multicast_join_snoopers(br);
+
return 0;
}
@@ -197,38 +196,14 @@ static int br_dev_stop(struct net_device *dev)
br_stp_disable_bridge(br);
br_multicast_stop(br);
+ if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ br_multicast_leave_snoopers(br);
+
netif_stop_queue(dev);
return 0;
}
-static void br_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *stats)
-{
- struct net_bridge *br = netdev_priv(dev);
- struct pcpu_sw_netstats tmp, sum = { 0 };
- unsigned int cpu;
-
- for_each_possible_cpu(cpu) {
- unsigned int start;
- const struct pcpu_sw_netstats *bstats
- = per_cpu_ptr(br->stats, cpu);
- do {
- start = u64_stats_fetch_begin_irq(&bstats->syncp);
- memcpy(&tmp, bstats, sizeof(tmp));
- } while (u64_stats_fetch_retry_irq(&bstats->syncp, start));
- sum.tx_bytes += tmp.tx_bytes;
- sum.tx_packets += tmp.tx_packets;
- sum.rx_bytes += tmp.rx_bytes;
- sum.rx_packets += tmp.rx_packets;
- }
-
- stats->tx_bytes = sum.tx_bytes;
- stats->tx_packets = sum.tx_packets;
- stats->rx_bytes = sum.rx_bytes;
- stats->rx_packets = sum.rx_packets;
-}
-
static int br_change_mtu(struct net_device *dev, int new_mtu)
{
struct net_bridge *br = netdev_priv(dev);
@@ -422,7 +397,7 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_init = br_dev_init,
.ndo_uninit = br_dev_uninit,
.ndo_start_xmit = br_dev_xmit,
- .ndo_get_stats64 = br_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_set_mac_address = br_set_mac_address,
.ndo_set_rx_mode = br_dev_set_multicast_list,
.ndo_change_rx_flags = br_dev_change_rx_flags,
@@ -473,8 +448,12 @@ void br_dev_setup(struct net_device *dev)
spin_lock_init(&br->lock);
INIT_LIST_HEAD(&br->port_list);
INIT_HLIST_HEAD(&br->fdb_list);
+ INIT_HLIST_HEAD(&br->frame_type_list);
#if IS_ENABLED(CONFIG_BRIDGE_MRP)
- INIT_LIST_HEAD(&br->mrp_list);
+ INIT_HLIST_HEAD(&br->mrp_list);
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+ INIT_HLIST_HEAD(&br->mep_list);
#endif
spin_lock_init(&br->hash_lock);
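
The bridge now follows the generic dev->tstats pattern instead of a private pcpu_sw_netstats pointer: allocate the per-CPU stats in ndo_init, bump them on the datapath with dev_sw_netstats_rx_add()/dev_sw_netstats_tx_add(), and point .ndo_get_stats64 at dev_get_tstats64. A condensed sketch of the same pattern for a hypothetical virtual device ("foo" and its ops are made-up names; the helpers are the ones this patch switches to):

/* Sketch: the dev->tstats pattern this patch converts the bridge to. */
static int foo_dev_init(struct net_device *dev)
{
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	return dev->tstats ? 0 : -ENOMEM;
}

static void foo_dev_uninit(struct net_device *dev)
{
	free_percpu(dev->tstats);
}

static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
{
	dev_sw_netstats_tx_add(dev, 1, skb->len);	/* one packet, skb->len bytes */
	/* ... real transmit path would go here ... */
	consume_skb(skb);
	return NETDEV_TX_OK;
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_init	 = foo_dev_init,
	.ndo_uninit	 = foo_dev_uninit,
	.ndo_start_xmit	 = foo_xmit,
	.ndo_get_stats64 = dev_get_tstats64,
};
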
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 7629b63f6f30..e28ffadd1371 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -274,14 +274,23 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *prev = NULL;
struct net_bridge_port_group *p;
+ bool allow_mode_include = true;
struct hlist_node *rp;
rp = rcu_dereference(hlist_first_rcu(&br->router_list));
- p = mdst ? rcu_dereference(mdst->ports) : NULL;
+ if (mdst) {
+ p = rcu_dereference(mdst->ports);
+ if (br_multicast_should_handle_mode(br, mdst->addr.proto) &&
+ br_multicast_is_star_g(&mdst->addr))
+ allow_mode_include = false;
+ } else {
+ p = NULL;
+ }
+
while (p || rp) {
struct net_bridge_port *port, *lport, *rport;
- lport = p ? p->port : NULL;
+ lport = p ? p->key.port : NULL;
rport = hlist_entry_safe(rp, struct net_bridge_port, rlist);
if ((unsigned long)lport > (unsigned long)rport) {
@@ -292,6 +301,10 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
local_orig);
goto delivered;
}
+ if ((!allow_mode_include &&
+ p->filter_mode == MCAST_INCLUDE) ||
+ (p->flags & MDB_PG_FLAGS_BLOCKED))
+ goto delivered;
} else {
port = rport;
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index a0e9a7937412..f7d2f472ae24 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -334,6 +334,7 @@ static void del_nbp(struct net_bridge_port *p)
spin_unlock_bh(&br->lock);
br_mrp_port_del(br, p);
+ br_cfm_port_del(br, p);
br_ifinfo_notify(RTM_DELLINK, NULL, p);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 59a318b9f646..8ca1f1bc6d12 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -35,12 +35,8 @@ static int br_pass_frame_up(struct sk_buff *skb)
struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
struct net_bridge *br = netdev_priv(brdev);
struct net_bridge_vlan_group *vg;
- struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
- u64_stats_update_begin(&brstats->syncp);
- brstats->rx_packets++;
- brstats->rx_bytes += skb->len;
- u64_stats_update_end(&brstats->syncp);
+ dev_sw_netstats_rx_add(brdev, skb->len);
vg = br_vlan_group_rcu(br);
/* Bridge is just like any other port. Make sure the
@@ -134,7 +130,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
case BR_PKT_MULTICAST:
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
- br_multicast_querier_exists(br, eth_hdr(skb))) {
+ br_multicast_querier_exists(br, eth_hdr(skb), mdst)) {
if ((mdst && mdst->host_joined) ||
br_multicast_is_router(br)) {
local_rcv = true;
@@ -254,6 +250,21 @@ frame_finish:
return RX_HANDLER_CONSUMED;
}
+/* Return 0 if the frame was not processed, otherwise 1
+ * note: already called with rcu_read_lock
+ */
+static int br_process_frame_type(struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ struct br_frame_type *tmp;
+
+ hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list)
+ if (unlikely(tmp->type == skb->protocol))
+ return tmp->frame_handler(p, skb);
+
+ return 0;
+}
+
/*
* Return NULL if skb is handled
* note: already called with rcu_read_lock
@@ -343,7 +354,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
}
}
- if (unlikely(br_mrp_process(p, skb)))
+ if (unlikely(br_process_frame_type(p, skb)))
return RX_HANDLER_PASS;
forward:
@@ -380,3 +391,19 @@ rx_handler_func_t *br_get_rx_handler(const struct net_device *dev)
return br_handle_frame;
}
+
+void br_add_frame(struct net_bridge *br, struct br_frame_type *ft)
+{
+ hlist_add_head_rcu(&ft->list, &br->frame_type_list);
+}
+
+void br_del_frame(struct net_bridge *br, struct br_frame_type *ft)
+{
+ struct br_frame_type *tmp;
+
+ hlist_for_each_entry(tmp, &br->frame_type_list, list)
+ if (ft == tmp) {
+ hlist_del_rcu(&ft->list);
+ return;
+ }
+}
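
For illustration, a minimal sketch of how another bridge subsystem could hook a custom EtherType through the frame_type_list API added above (the handler signature and return convention follow br_process_frame_type(); the EtherType value and all "foo" names are hypothetical, not part of this patch):

/* Hypothetical handler: return 1 when the skb was consumed, 0 to let the
 * bridge continue normal processing (same convention as br_mrp_process()).
 */
static int br_foo_process(struct net_bridge_port *p, struct sk_buff *skb)
{
	/* inspect skb here; nothing is consumed in this sketch */
	return 0;
}

static struct br_frame_type foo_frame_type __read_mostly = {
	.type		= cpu_to_be16(0x88B5),	/* assumed experimental EtherType */
	.frame_handler	= br_foo_process,
};

/* Registered/unregistered under RTNL when the subsystem is enabled/disabled
 * on the bridge, mirroring the MRP conversion later in this patch:
 *
 *	br_add_frame(br, &foo_frame_type);
 *	...
 *	br_del_frame(br, &foo_frame_type);
 */
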
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 5e71fc8b826f..2db800fc27ca 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -103,7 +103,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
/*
* Legacy ioctl's through SIOCDEVPRIVATE
- * This interface is deprecated because it was too difficult to
+ * This interface is deprecated because it was too difficult
* to do the translation for 32/64bit ioctl compatibility.
*/
static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index da5ed4cf9233..8846c5bcd075 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -62,25 +62,98 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
e->flags |= MDB_FLAGS_OFFLOAD;
if (flags & MDB_PG_FLAGS_FAST_LEAVE)
e->flags |= MDB_FLAGS_FAST_LEAVE;
+ if (flags & MDB_PG_FLAGS_STAR_EXCL)
+ e->flags |= MDB_FLAGS_STAR_EXCL;
+ if (flags & MDB_PG_FLAGS_BLOCKED)
+ e->flags |= MDB_FLAGS_BLOCKED;
}
-static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
+static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip,
+ struct nlattr **mdb_attrs)
{
memset(ip, 0, sizeof(struct br_ip));
ip->vid = entry->vid;
ip->proto = entry->addr.proto;
- if (ip->proto == htons(ETH_P_IP))
- ip->u.ip4 = entry->addr.u.ip4;
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ ip->dst.ip4 = entry->addr.u.ip4;
+ if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE])
+ ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
- ip->u.ip6 = entry->addr.u.ip6;
+ case htons(ETH_P_IPV6):
+ ip->dst.ip6 = entry->addr.u.ip6;
+ if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE])
+ ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]);
+ break;
#endif
+ default:
+ ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr);
+ }
+
+}
+
+static int __mdb_fill_srcs(struct sk_buff *skb,
+ struct net_bridge_port_group *p)
+{
+ struct net_bridge_group_src *ent;
+ struct nlattr *nest, *nest_ent;
+
+ if (hlist_empty(&p->src_list))
+ return 0;
+
+ nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST);
+ if (!nest)
+ return -EMSGSIZE;
+
+ hlist_for_each_entry_rcu(ent, &p->src_list, node,
+ lockdep_is_held(&p->key.port->br->multicast_lock)) {
+ nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY);
+ if (!nest_ent)
+ goto out_cancel_err;
+ switch (ent->addr.proto) {
+ case htons(ETH_P_IP):
+ if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+ ent->addr.src.ip4)) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+ &ent->addr.src.ip6)) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ break;
+#endif
+ default:
+ nla_nest_cancel(skb, nest_ent);
+ continue;
+ }
+ if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER,
+ br_timer_value(&ent->timer))) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ nla_nest_end(skb, nest_ent);
+ }
+
+ nla_nest_end(skb, nest);
+
+ return 0;
+
+out_cancel_err:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
}
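
__mdb_fill_srcs() above uses the standard nested netlink attribute pattern: open a nest, cancel it on any failure so no partial data leaks into the dump, and close it only once every member attribute fit. A condensed sketch of that pattern (the attribute names are illustrative, not defined by this patch):

	struct nlattr *nest;

	nest = nla_nest_start(skb, EXAMPLE_NESTED_ATTR);
	if (!nest)
		return -EMSGSIZE;
	if (nla_put_u32(skb, EXAMPLE_MEMBER_ATTR, val)) {
		/* roll back everything added since nla_nest_start() */
		nla_nest_cancel(skb, nest);
		return -EMSGSIZE;
	}
	/* patch up the nest header's length now that all members are in */
	nla_nest_end(skb, nest);
	return 0;
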
static int __mdb_fill_info(struct sk_buff *skb,
struct net_bridge_mdb_entry *mp,
struct net_bridge_port_group *p)
{
+ bool dump_srcs_mode = false;
struct timer_list *mtimer;
struct nlattr *nest_ent;
struct br_mdb_entry e;
@@ -89,7 +162,7 @@ static int __mdb_fill_info(struct sk_buff *skb,
memset(&e, 0, sizeof(e));
if (p) {
- ifindex = p->port->dev->ifindex;
+ ifindex = p->key.port->dev->ifindex;
mtimer = &p->timer;
flags = p->flags;
} else {
@@ -101,11 +174,13 @@ static int __mdb_fill_info(struct sk_buff *skb,
e.ifindex = ifindex;
e.vid = mp->addr.vid;
if (mp->addr.proto == htons(ETH_P_IP))
- e.addr.u.ip4 = mp->addr.u.ip4;
+ e.addr.u.ip4 = mp->addr.dst.ip4;
#if IS_ENABLED(CONFIG_IPV6)
- if (mp->addr.proto == htons(ETH_P_IPV6))
- e.addr.u.ip6 = mp->addr.u.ip6;
+ else if (mp->addr.proto == htons(ETH_P_IPV6))
+ e.addr.u.ip6 = mp->addr.dst.ip6;
#endif
+ else
+ ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr);
e.addr.proto = mp->addr.proto;
nest_ent = nla_nest_start_noflag(skb,
MDBA_MDB_ENTRY_INFO);
@@ -115,19 +190,55 @@ static int __mdb_fill_info(struct sk_buff *skb,
if (nla_put_nohdr(skb, sizeof(e), &e) ||
nla_put_u32(skb,
MDBA_MDB_EATTR_TIMER,
- br_timer_value(mtimer))) {
- nla_nest_cancel(skb, nest_ent);
- return -EMSGSIZE;
+ br_timer_value(mtimer)))
+ goto nest_err;
+
+ switch (mp->addr.proto) {
+ case htons(ETH_P_IP):
+ dump_srcs_mode = !!(mp->br->multicast_igmp_version == 3);
+ if (mp->addr.src.ip4) {
+ if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE,
+ mp->addr.src.ip4))
+ goto nest_err;
+ break;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ dump_srcs_mode = !!(mp->br->multicast_mld_version == 2);
+ if (!ipv6_addr_any(&mp->addr.src.ip6)) {
+ if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE,
+ &mp->addr.src.ip6))
+ goto nest_err;
+ break;
+ }
+ break;
+#endif
+ default:
+ ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr);
+ }
+ if (p) {
+ if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol))
+ goto nest_err;
+ if (dump_srcs_mode &&
+ (__mdb_fill_srcs(skb, p) ||
+ nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE,
+ p->filter_mode)))
+ goto nest_err;
}
nla_nest_end(skb, nest_ent);
return 0;
+
+nest_err:
+ nla_nest_cancel(skb, nest_ent);
+ return -EMSGSIZE;
}
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev)
{
- int idx = 0, s_idx = cb->args[1], err = 0;
+ int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2];
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_mdb_entry *mp;
struct nlattr *nest, *nest2;
@@ -152,7 +263,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
break;
}
- if (mp->host_joined) {
+ if (!s_pidx && mp->host_joined) {
err = __mdb_fill_info(skb, mp, NULL);
if (err) {
nla_nest_cancel(skb, nest2);
@@ -162,15 +273,21 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
pp = &p->next) {
- if (!p->port)
+ if (!p->key.port)
continue;
+ if (pidx < s_pidx)
+ goto skip_pg;
err = __mdb_fill_info(skb, mp, p);
if (err) {
- nla_nest_cancel(skb, nest2);
+ nla_nest_end(skb, nest2);
goto out;
}
+skip_pg:
+ pidx++;
}
+ pidx = 0;
+ s_pidx = 0;
nla_nest_end(skb, nest2);
skip:
idx++;
@@ -178,6 +295,7 @@ skip:
out:
cb->args[1] = idx;
+ cb->args[2] = pidx;
nla_nest_end(skb, nest);
return err;
}
@@ -263,14 +381,15 @@ out:
static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
struct net_device *dev,
- struct br_mdb_entry *entry, u32 pid,
- u32 seq, int type, unsigned int flags)
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
{
struct nlmsghdr *nlh;
struct br_port_msg *bpm;
struct nlattr *nest, *nest2;
- nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
+ nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0);
if (!nlh)
return -EMSGSIZE;
@@ -285,7 +404,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
if (nest2 == NULL)
goto end;
- if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(*entry), entry))
+ if (__mdb_fill_info(skb, mp, pg))
goto end;
nla_nest_end(skb, nest2);
@@ -300,10 +419,58 @@ cancel:
return -EMSGSIZE;
}
-static inline size_t rtnl_mdb_nlmsg_size(void)
+static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg)
{
- return NLMSG_ALIGN(sizeof(struct br_port_msg))
- + nla_total_size(sizeof(struct br_mdb_entry));
+ size_t nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) +
+ nla_total_size(sizeof(struct br_mdb_entry)) +
+ nla_total_size(sizeof(u32));
+ struct net_bridge_group_src *ent;
+ size_t addr_size = 0;
+
+ if (!pg)
+ goto out;
+
+ /* MDBA_MDB_EATTR_RTPROT */
+ nlmsg_size += nla_total_size(sizeof(u8));
+
+ switch (pg->key.addr.proto) {
+ case htons(ETH_P_IP):
+ /* MDBA_MDB_EATTR_SOURCE */
+ if (pg->key.addr.src.ip4)
+ nlmsg_size += nla_total_size(sizeof(__be32));
+ if (pg->key.port->br->multicast_igmp_version == 2)
+ goto out;
+ addr_size = sizeof(__be32);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ /* MDBA_MDB_EATTR_SOURCE */
+ if (!ipv6_addr_any(&pg->key.addr.src.ip6))
+ nlmsg_size += nla_total_size(sizeof(struct in6_addr));
+ if (pg->key.port->br->multicast_mld_version == 1)
+ goto out;
+ addr_size = sizeof(struct in6_addr);
+ break;
+#endif
+ }
+
+ /* MDBA_MDB_EATTR_GROUP_MODE */
+ nlmsg_size += nla_total_size(sizeof(u8));
+
+ /* MDBA_MDB_EATTR_SRC_LIST nested attr */
+ if (!hlist_empty(&pg->src_list))
+ nlmsg_size += nla_total_size(0);
+
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ /* MDBA_MDB_SRCLIST_ENTRY nested attr +
+ * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER
+ */
+ nlmsg_size += nla_total_size(0) +
+ nla_total_size(addr_size) +
+ nla_total_size(sizeof(u32));
+ }
+out:
+ return nlmsg_size;
}
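
To make the accounting concrete (illustrative only): under IGMPv3, a port group whose (*,G) key carries no source but which has two entries in src_list is sized as

	  NLMSG_ALIGN(sizeof(struct br_port_msg))
	+ nla_total_size(sizeof(struct br_mdb_entry))	/* MDBA_MDB_ENTRY_INFO */
	+ nla_total_size(sizeof(u32))			/* MDBA_MDB_EATTR_TIMER */
	+ nla_total_size(sizeof(u8))			/* MDBA_MDB_EATTR_RTPROT */
	+ nla_total_size(sizeof(u8))			/* MDBA_MDB_EATTR_GROUP_MODE */
	+ nla_total_size(0)				/* MDBA_MDB_EATTR_SRC_LIST */
	+ 2 * (nla_total_size(0)			/* MDBA_MDB_SRCLIST_ENTRY */
	       + nla_total_size(sizeof(__be32))		/* MDBA_MDB_SRCATTR_ADDRESS */
	       + nla_total_size(sizeof(u32)))		/* MDBA_MDB_SRCATTR_TIMER */

with the MDBA_MDB_EATTR_SOURCE term added only when the key itself has a source, i.e. for S,G entries.
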
struct br_mdb_complete_info {
@@ -329,7 +496,7 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv)
goto out;
for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
- if (p->port != port)
+ if (p->key.port != port)
continue;
p->flags |= MDB_PG_FLAGS_OFFLOAD;
}
@@ -341,21 +508,22 @@ err:
static void br_mdb_switchdev_host_port(struct net_device *dev,
struct net_device *lower_dev,
- struct br_mdb_entry *entry, int type)
+ struct net_bridge_mdb_entry *mp,
+ int type)
{
struct switchdev_obj_port_mdb mdb = {
.obj = {
.id = SWITCHDEV_OBJ_ID_HOST_MDB,
.flags = SWITCHDEV_F_DEFER,
},
- .vid = entry->vid,
+ .vid = mp->addr.vid,
};
- if (entry->addr.proto == htons(ETH_P_IP))
- ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ if (mp->addr.proto == htons(ETH_P_IP))
+ ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr);
#if IS_ENABLED(CONFIG_IPV6)
else
- ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr);
#endif
mdb.obj.orig_dev = dev;
@@ -370,17 +538,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
}
static void br_mdb_switchdev_host(struct net_device *dev,
- struct br_mdb_entry *entry, int type)
+ struct net_bridge_mdb_entry *mp, int type)
{
struct net_device *lower_dev;
struct list_head *iter;
netdev_for_each_lower_dev(dev, lower_dev, iter)
- br_mdb_switchdev_host_port(dev, lower_dev, entry, type);
+ br_mdb_switchdev_host_port(dev, lower_dev, mp, type);
}
-static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
- struct br_mdb_entry *entry, int type)
+void br_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
{
struct br_mdb_complete_info *complete_info;
struct switchdev_obj_port_mdb mdb = {
@@ -388,44 +558,48 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
.id = SWITCHDEV_OBJ_ID_PORT_MDB,
.flags = SWITCHDEV_F_DEFER,
},
- .vid = entry->vid,
+ .vid = mp->addr.vid,
};
- struct net_device *port_dev;
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
- port_dev = __dev_get_by_index(net, entry->ifindex);
- if (entry->addr.proto == htons(ETH_P_IP))
- ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ if (pg) {
+ if (mp->addr.proto == htons(ETH_P_IP))
+ ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr);
#if IS_ENABLED(CONFIG_IPV6)
- else
- ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ else if (mp->addr.proto == htons(ETH_P_IPV6))
+ ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr);
#endif
-
- mdb.obj.orig_dev = port_dev;
- if (p && port_dev && type == RTM_NEWMDB) {
- complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC);
- if (complete_info) {
- complete_info->port = p;
- __mdb_entry_to_br_ip(entry, &complete_info->ip);
+ else
+ ether_addr_copy(mdb.addr, mp->addr.dst.mac_addr);
+
+ mdb.obj.orig_dev = pg->key.port->dev;
+ switch (type) {
+ case RTM_NEWMDB:
+ complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC);
+ if (!complete_info)
+ break;
+ complete_info->port = pg->key.port;
+ complete_info->ip = mp->addr;
mdb.obj.complete_priv = complete_info;
mdb.obj.complete = br_mdb_complete;
- if (switchdev_port_obj_add(port_dev, &mdb.obj, NULL))
+ if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL))
kfree(complete_info);
+ break;
+ case RTM_DELMDB:
+ switchdev_port_obj_del(pg->key.port->dev, &mdb.obj);
+ break;
}
- } else if (p && port_dev && type == RTM_DELMDB) {
- switchdev_port_obj_del(port_dev, &mdb.obj);
+ } else {
+ br_mdb_switchdev_host(dev, mp, type);
}
- if (!p)
- br_mdb_switchdev_host(dev, entry, type);
-
- skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC);
+ skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC);
if (!skb)
goto errout;
- err = nlmsg_populate_mdb_fill(skb, dev, entry, 0, 0, type, NTF_SELF);
+ err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -437,26 +611,6 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_MDB, err);
}
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type, u8 flags)
-{
- struct br_mdb_entry entry;
-
- memset(&entry, 0, sizeof(entry));
- if (port)
- entry.ifindex = port->dev->ifindex;
- else
- entry.ifindex = dev->ifindex;
- entry.addr.proto = group->proto;
- entry.addr.u.ip4 = group->u.ip4;
-#if IS_ENABLED(CONFIG_IPV6)
- entry.addr.u.ip6 = group->u.ip6;
-#endif
- entry.vid = group->vid;
- __mdb_entry_fill_flags(&entry, flags);
- __br_mdb_notify(dev, port, &entry, type);
-}
-
static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
struct net_device *dev,
int ifindex, u32 pid,
@@ -524,33 +678,100 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_MDB, err);
}
-static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
+static bool is_valid_mdb_entry(struct br_mdb_entry *entry,
+ struct netlink_ext_ack *extack)
{
- if (entry->ifindex == 0)
+ if (entry->ifindex == 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Zero entry ifindex is not allowed");
return false;
+ }
if (entry->addr.proto == htons(ETH_P_IP)) {
- if (!ipv4_is_multicast(entry->addr.u.ip4))
+ if (!ipv4_is_multicast(entry->addr.u.ip4)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is not multicast");
return false;
- if (ipv4_is_local_multicast(entry->addr.u.ip4))
+ }
+ if (ipv4_is_local_multicast(entry->addr.u.ip4)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is local multicast");
return false;
+ }
#if IS_ENABLED(CONFIG_IPV6)
} else if (entry->addr.proto == htons(ETH_P_IPV6)) {
- if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
+ if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 entry group address is link-local all nodes");
return false;
+ }
#endif
- } else
+ } else if (entry->addr.proto == 0) {
+ /* L2 mdb */
+ if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) {
+ NL_SET_ERR_MSG_MOD(extack, "L2 entry group is not multicast");
+ return false;
+ }
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Unknown entry protocol");
return false;
- if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
+ }
+
+ if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
+ NL_SET_ERR_MSG_MOD(extack, "Unknown entry state");
return false;
- if (entry->vid >= VLAN_VID_MASK)
+ }
+ if (entry->vid >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid entry VLAN id");
return false;
+ }
return true;
}
+static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto,
+ struct netlink_ext_ack *extack)
+{
+ switch (proto) {
+ case htons(ETH_P_IP):
+ if (nla_len(attr) != sizeof(struct in_addr)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length");
+ return false;
+ }
+ if (ipv4_is_multicast(nla_get_in_addr(attr))) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed");
+ return false;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6): {
+ struct in6_addr src;
+
+ if (nla_len(attr) != sizeof(struct in6_addr)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length");
+ return false;
+ }
+ src = nla_get_in6_addr(attr);
+ if (ipv6_addr_is_multicast(&src)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed");
+ return false;
+ }
+ break;
+ }
+#endif
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address");
+ return false;
+ }
+
+ return true;
+}
+
+static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = {
+ [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY,
+ sizeof(struct in_addr),
+ sizeof(struct in6_addr)),
+};
+
static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct net_device **pdev, struct br_mdb_entry **pentry)
+ struct net_device **pdev, struct br_mdb_entry **pentry,
+ struct nlattr **mdb_attrs, struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct br_mdb_entry *entry;
@@ -566,51 +787,92 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
bpm = nlmsg_data(nlh);
if (bpm->ifindex == 0) {
- pr_info("PF_BRIDGE: br_mdb_parse() with invalid ifindex\n");
+ NL_SET_ERR_MSG_MOD(extack, "Invalid bridge ifindex");
return -EINVAL;
}
dev = __dev_get_by_index(net, bpm->ifindex);
if (dev == NULL) {
- pr_info("PF_BRIDGE: br_mdb_parse() with unknown ifindex\n");
+ NL_SET_ERR_MSG_MOD(extack, "Bridge device doesn't exist");
return -ENODEV;
}
if (!(dev->priv_flags & IFF_EBRIDGE)) {
- pr_info("PF_BRIDGE: br_mdb_parse() with non-bridge\n");
+ NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge");
return -EOPNOTSUPP;
}
*pdev = dev;
- if (!tb[MDBA_SET_ENTRY] ||
- nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) {
- pr_info("PF_BRIDGE: br_mdb_parse() with invalid attr\n");
+ if (!tb[MDBA_SET_ENTRY]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY attribute");
+ return -EINVAL;
+ }
+ if (nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length");
return -EINVAL;
}
entry = nla_data(tb[MDBA_SET_ENTRY]);
- if (!is_valid_mdb_entry(entry)) {
- pr_info("PF_BRIDGE: br_mdb_parse() with invalid entry\n");
+ if (!is_valid_mdb_entry(entry, extack))
return -EINVAL;
+ *pentry = entry;
+
+ if (tb[MDBA_SET_ENTRY_ATTRS]) {
+ err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX,
+ tb[MDBA_SET_ENTRY_ATTRS],
+ br_mdbe_attrs_pol, extack);
+ if (err)
+ return err;
+ if (mdb_attrs[MDBE_ATTR_SOURCE] &&
+ !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE],
+ entry->addr.proto, extack))
+ return -EINVAL;
+ } else {
+ memset(mdb_attrs, 0,
+ sizeof(struct nlattr *) * (MDBE_ATTR_MAX + 1));
}
- *pentry = entry;
return 0;
}
static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
- struct br_ip *group, unsigned char state)
+ struct br_mdb_entry *entry,
+ struct nlattr **mdb_attrs,
+ struct netlink_ext_ack *extack)
{
- struct net_bridge_mdb_entry *mp;
+ struct net_bridge_mdb_entry *mp, *star_mp;
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
+ struct br_ip group, star_group;
unsigned long now = jiffies;
+ unsigned char flags = 0;
+ u8 filter_mode;
int err;
- mp = br_mdb_ip_get(br, group);
+ __mdb_entry_to_br_ip(entry, &group, mdb_attrs);
+
+ /* host join errors which can happen before creating the group */
+ if (!port) {
+ /* don't allow any flags for host-joined groups */
+ if (entry->state) {
+ NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups");
+ return -EINVAL;
+ }
+ if (!br_multicast_is_star_g(&group)) {
+ NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined");
+ return -EINVAL;
+ }
+ }
+
+ if (br_group_is_l2(&group) && entry->state != MDB_PERMANENT) {
+ NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed");
+ return -EINVAL;
+ }
+
+ mp = br_mdb_ip_get(br, &group);
if (!mp) {
- mp = br_multicast_new_group(br, group);
+ mp = br_multicast_new_group(br, &group);
err = PTR_ERR_OR_ZERO(mp);
if (err)
return err;
@@ -618,13 +880,13 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
/* host join */
if (!port) {
- /* don't allow any flags for host-joined groups */
- if (state)
- return -EINVAL;
- if (mp->host_joined)
+ if (mp->host_joined) {
+ NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host");
return -EEXIST;
+ }
br_multicast_host_join(mp, false);
+ br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB);
return 0;
}
@@ -632,54 +894,72 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
- if (p->port == port)
+ if (p->key.port == port) {
+ NL_SET_ERR_MSG_MOD(extack, "Group is already joined by port");
return -EEXIST;
- if ((unsigned long)p->port < (unsigned long)port)
+ }
+ if ((unsigned long)p->key.port < (unsigned long)port)
break;
}
- p = br_multicast_new_port_group(port, group, *pp, state, NULL);
- if (unlikely(!p))
+ filter_mode = br_multicast_is_star_g(&group) ? MCAST_EXCLUDE :
+ MCAST_INCLUDE;
+
+ if (entry->state == MDB_PERMANENT)
+ flags |= MDB_PG_FLAGS_PERMANENT;
+
+ p = br_multicast_new_port_group(port, &group, *pp, flags, NULL,
+ filter_mode, RTPROT_STATIC);
+ if (unlikely(!p)) {
+ NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group");
return -ENOMEM;
+ }
rcu_assign_pointer(*pp, p);
- if (state == MDB_TEMPORARY)
+ if (entry->state == MDB_TEMPORARY)
mod_timer(&p->timer, now + br->multicast_membership_interval);
+ br_mdb_notify(br->dev, mp, p, RTM_NEWMDB);
+ /* if we are adding a new EXCLUDE port group (*,G), it also needs to be
+ * added to all S,G entries for proper replication; if we are adding
+ * a new INCLUDE port group (S,G), then all *,G EXCLUDE ports need to
+ * be added to it for proper replication
+ */
+ if (br_multicast_should_handle_mode(br, group.proto)) {
+ switch (filter_mode) {
+ case MCAST_EXCLUDE:
+ br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE);
+ break;
+ case MCAST_INCLUDE:
+ star_group = p->key.addr;
+ memset(&star_group.src, 0, sizeof(star_group.src));
+ star_mp = br_mdb_ip_get(br, &star_group);
+ if (star_mp)
+ br_multicast_sg_add_exclude_ports(star_mp, p);
+ break;
+ }
+ }
return 0;
}
static int __br_mdb_add(struct net *net, struct net_bridge *br,
- struct br_mdb_entry *entry)
+ struct net_bridge_port *p,
+ struct br_mdb_entry *entry,
+ struct nlattr **mdb_attrs,
+ struct netlink_ext_ack *extack)
{
- struct br_ip ip;
- struct net_device *dev;
- struct net_bridge_port *p = NULL;
int ret;
- if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
- return -EINVAL;
-
- if (entry->ifindex != br->dev->ifindex) {
- dev = __dev_get_by_index(net, entry->ifindex);
- if (!dev)
- return -ENODEV;
-
- p = br_port_get_rtnl(dev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
- return -EINVAL;
- }
-
- __mdb_entry_to_br_ip(entry, &ip);
-
spin_lock_bh(&br->multicast_lock);
- ret = br_mdb_add_group(br, p, &ip, entry->state);
+ ret = br_mdb_add_group(br, p, entry, mdb_attrs, extack);
spin_unlock_bh(&br->multicast_lock);
+
return ret;
}
static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1];
struct net *net = sock_net(skb->sk);
struct net_bridge_vlan_group *vg;
struct net_bridge_port *p = NULL;
@@ -689,20 +969,43 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_bridge *br;
int err;
- err = br_mdb_parse(skb, nlh, &dev, &entry);
+ err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack);
if (err < 0)
return err;
br = netdev_priv(dev);
+ if (!netif_running(br->dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running");
+ return -EINVAL;
+ }
+
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) {
+ NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled");
+ return -EINVAL;
+ }
+
if (entry->ifindex != br->dev->ifindex) {
pdev = __dev_get_by_index(net, entry->ifindex);
- if (!pdev)
+ if (!pdev) {
+ NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist");
return -ENODEV;
+ }
p = br_port_get_rtnl(pdev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port");
+ return -EINVAL;
+ }
+
+ if (p->br != br) {
+ NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device");
return -EINVAL;
+ }
+ if (p->state == BR_STATE_DISABLED) {
+ NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state");
+ return -EINVAL;
+ }
vg = nbp_vlan_group(p);
} else {
vg = br_vlan_group(br);
@@ -714,21 +1017,19 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
list_for_each_entry(v, &vg->vlan_list, vlist) {
entry->vid = v->vid;
- err = __br_mdb_add(net, br, entry);
+ err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack);
if (err)
break;
- __br_mdb_notify(dev, p, entry, RTM_NEWMDB);
}
} else {
- err = __br_mdb_add(net, br, entry);
- if (!err)
- __br_mdb_notify(dev, p, entry, RTM_NEWMDB);
+ err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack);
}
return err;
}
-static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
+static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry,
+ struct nlattr **mdb_attrs)
{
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
@@ -739,7 +1040,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
return -EINVAL;
- __mdb_entry_to_br_ip(entry, &ip);
+ __mdb_entry_to_br_ip(entry, &ip, mdb_attrs);
spin_lock_bh(&br->multicast_lock);
mp = br_mdb_ip_get(br, &ip);
@@ -750,6 +1051,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) {
br_multicast_host_leave(mp, false);
err = 0;
+ br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB);
if (!mp->ports && netif_running(br->dev))
mod_timer(&mp->timer, jiffies);
goto unlock;
@@ -758,22 +1060,14 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
- if (!p->port || p->port->dev->ifindex != entry->ifindex)
+ if (!p->key.port || p->key.port->dev->ifindex != entry->ifindex)
continue;
- if (p->port->state == BR_STATE_DISABLED)
+ if (p->key.port->state == BR_STATE_DISABLED)
goto unlock;
- __mdb_entry_fill_flags(entry, p->flags);
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- kfree_rcu(p, rcu);
+ br_multicast_del_pg(mp, p, pp);
err = 0;
-
- if (!mp->ports && !mp->host_joined &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
break;
}
@@ -785,6 +1079,7 @@ unlock:
static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1];
struct net *net = sock_net(skb->sk);
struct net_bridge_vlan_group *vg;
struct net_bridge_port *p = NULL;
@@ -794,7 +1089,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_bridge *br;
int err;
- err = br_mdb_parse(skb, nlh, &dev, &entry);
+ err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack);
if (err < 0)
return err;
@@ -819,14 +1114,10 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
list_for_each_entry(v, &vg->vlan_list, vlist) {
entry->vid = v->vid;
- err = __br_mdb_del(br, entry);
- if (!err)
- __br_mdb_notify(dev, p, entry, RTM_DELMDB);
+ err = __br_mdb_del(br, entry, mdb_attrs);
}
} else {
- err = __br_mdb_del(br, entry);
- if (!err)
- __br_mdb_notify(dev, p, entry, RTM_DELMDB);
+ err = __br_mdb_del(br, entry, mdb_attrs);
}
return err;
diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
index b36689e6e7cb..5aeae6ad17b3 100644
--- a/net/bridge/br_mrp.c
+++ b/net/bridge/br_mrp.c
@@ -6,6 +6,13 @@
static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 };
static const u8 mrp_in_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x3 };
+static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb);
+
+static struct br_frame_type mrp_frame_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_MRP),
+ .frame_handler = br_mrp_process,
+};
+
static bool br_mrp_is_ring_port(struct net_bridge_port *p_port,
struct net_bridge_port *s_port,
struct net_bridge_port *port)
@@ -47,8 +54,8 @@ static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id)
struct br_mrp *res = NULL;
struct br_mrp *mrp;
- list_for_each_entry_rcu(mrp, &br->mrp_list, list,
- lockdep_rtnl_is_held()) {
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
if (mrp->ring_id == ring_id) {
res = mrp;
break;
@@ -63,8 +70,8 @@ static struct br_mrp *br_mrp_find_in_id(struct net_bridge *br, u32 in_id)
struct br_mrp *res = NULL;
struct br_mrp *mrp;
- list_for_each_entry_rcu(mrp, &br->mrp_list, list,
- lockdep_rtnl_is_held()) {
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
if (mrp->in_id == in_id) {
res = mrp;
break;
@@ -78,8 +85,8 @@ static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex)
{
struct br_mrp *mrp;
- list_for_each_entry_rcu(mrp, &br->mrp_list, list,
- lockdep_rtnl_is_held()) {
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
struct net_bridge_port *p;
p = rtnl_dereference(mrp->p_port);
@@ -104,8 +111,8 @@ static struct br_mrp *br_mrp_find_port(struct net_bridge *br,
struct br_mrp *res = NULL;
struct br_mrp *mrp;
- list_for_each_entry_rcu(mrp, &br->mrp_list, list,
- lockdep_rtnl_is_held()) {
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
if (rcu_access_pointer(mrp->p_port) == p ||
rcu_access_pointer(mrp->s_port) == p ||
rcu_access_pointer(mrp->i_port) == p) {
@@ -443,8 +450,11 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp)
rcu_assign_pointer(mrp->i_port, NULL);
}
- list_del_rcu(&mrp->list);
+ hlist_del_rcu(&mrp->list);
kfree_rcu(mrp, rcu);
+
+ if (hlist_empty(&br->mrp_list))
+ br_del_frame(br, &mrp_frame_type);
}
/* Adds a new MRP instance.
@@ -493,9 +503,12 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance)
spin_unlock_bh(&br->lock);
rcu_assign_pointer(mrp->s_port, p);
+ if (hlist_empty(&br->mrp_list))
+ br_add_frame(br, &mrp_frame_type);
+
INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired);
INIT_DELAYED_WORK(&mrp->in_test_work, br_mrp_in_test_work_expired);
- list_add_tail_rcu(&mrp->list, &br->mrp_list);
+ hlist_add_tail_rcu(&mrp->list, &br->mrp_list);
err = br_mrp_switchdev_add(br, mrp);
if (err)
@@ -544,19 +557,22 @@ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance)
int br_mrp_set_port_state(struct net_bridge_port *p,
enum br_mrp_port_state_type state)
{
+ u32 port_state;
+
if (!p || !(p->flags & BR_MRP_AWARE))
return -EINVAL;
spin_lock_bh(&p->br->lock);
if (state == BR_MRP_PORT_STATE_FORWARDING)
- p->state = BR_STATE_FORWARDING;
+ port_state = BR_STATE_FORWARDING;
else
- p->state = BR_STATE_BLOCKING;
+ port_state = BR_STATE_BLOCKING;
+ p->state = port_state;
spin_unlock_bh(&p->br->lock);
- br_mrp_port_switchdev_set_state(p, state);
+ br_mrp_port_switchdev_set_state(p, port_state);
return 0;
}
@@ -845,7 +861,8 @@ static bool br_mrp_in_frame(struct sk_buff *skb)
if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST ||
hdr->type == BR_MRP_TLV_HEADER_IN_TOPO ||
hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN ||
- hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP)
+ hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP ||
+ hdr->type == BR_MRP_TLV_HEADER_IN_LINK_STATUS)
return true;
return false;
@@ -1113,9 +1130,9 @@ static int br_mrp_rcv(struct net_bridge_port *p,
goto no_forward;
}
} else {
- /* MIM should forward IntLinkChange and
+ /* MIM should forward IntLinkChange/Status and
* IntTopoChange between ring ports but MIM
- * should not forward IntLinkChange and
+ * should not forward IntLinkChange/Status and
* IntTopoChange if the frame was received at
* the interconnect port
*/
@@ -1142,6 +1159,17 @@ static int br_mrp_rcv(struct net_bridge_port *p,
in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN))
goto forward;
+ /* MIC should forward IntLinkStatus frames only to the
+ * interconnect port if the frame was received on a ring port.
+ * If it is received on the interconnect port, then it
+ * should be forwarded on both ring ports
+ */
+ if (br_mrp_is_ring_port(p_port, s_port, p) &&
+ in_type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) {
+ p_dst = NULL;
+ s_dst = NULL;
+ }
+
/* Should forward the InTopo frames only between the
* ring ports
*/
@@ -1172,20 +1200,18 @@ no_forward:
* normal forwarding.
* note: already called with rcu_read_lock
*/
-int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb)
+static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb)
{
/* If there is no MRP instance do normal forwarding */
if (likely(!(p->flags & BR_MRP_AWARE)))
goto out;
- if (unlikely(skb->protocol == htons(ETH_P_MRP)))
- return br_mrp_rcv(p, skb, p->dev);
-
+ return br_mrp_rcv(p, skb, p->dev);
out:
return 0;
}
bool br_mrp_enabled(struct net_bridge *br)
{
- return !list_empty(&br->mrp_list);
+ return !hlist_empty(&br->mrp_list);
}
diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c
index 2a2fdf3500c5..ce6f63c77cc0 100644
--- a/net/bridge/br_mrp_netlink.c
+++ b/net/bridge/br_mrp_netlink.c
@@ -453,7 +453,7 @@ int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br)
if (!mrp_tb)
return -EMSGSIZE;
- list_for_each_entry_rcu(mrp, &br->mrp_list, list) {
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list) {
struct net_bridge_port *p;
tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP_INFO);
diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c
index ed547e03ace1..75a7e8d0a268 100644
--- a/net/bridge/br_mrp_switchdev.c
+++ b/net/bridge/br_mrp_switchdev.c
@@ -169,13 +169,12 @@ int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp,
return err;
}
-int br_mrp_port_switchdev_set_state(struct net_bridge_port *p,
- enum br_mrp_port_state_type state)
+int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state)
{
struct switchdev_attr attr = {
.orig_dev = p->dev,
- .id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE,
- .u.mrp_port_state = state,
+ .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+ .u.stp_state = state,
};
int err;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 4c4a93abde68..257ac4e25f6d 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -41,6 +41,13 @@ static const struct rhashtable_params br_mdb_rht_params = {
.automatic_shrinking = true,
};
+static const struct rhashtable_params br_sg_port_rht_params = {
+ .head_offset = offsetof(struct net_bridge_port_group, rhnode),
+ .key_offset = offsetof(struct net_bridge_port_group, key),
+ .key_len = sizeof(struct net_bridge_port_group_sg_key),
+ .automatic_shrinking = true,
+};
+
static void br_multicast_start_querier(struct net_bridge *br,
struct bridge_mcast_own_query *query);
static void br_multicast_add_router(struct net_bridge *br,
@@ -50,6 +57,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
__be32 group,
__u16 vid,
const unsigned char *src);
+static void br_multicast_port_group_rexmit(struct timer_list *t);
static void __del_port_router(struct net_bridge_port *p);
#if IS_ENABLED(CONFIG_IPV6)
@@ -58,6 +66,26 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
const struct in6_addr *group,
__u16 vid, const unsigned char *src);
#endif
+static struct net_bridge_port_group *
+__br_multicast_add_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct br_ip *group,
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1,
+ bool blocked);
+static void br_multicast_find_del_pg(struct net_bridge *br,
+ struct net_bridge_port_group *pg);
+
+static struct net_bridge_port_group *
+br_sg_port_find(struct net_bridge *br,
+ struct net_bridge_port_group_sg_key *sg_p)
+{
+ lockdep_assert_held_once(&br->multicast_lock);
+
+ return rhashtable_lookup_fast(&br->sg_port_tbl, sg_p,
+ br_sg_port_rht_params);
+}
static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br,
struct br_ip *dst)
@@ -85,7 +113,7 @@ static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br,
struct br_ip br_dst;
memset(&br_dst, 0, sizeof(br_dst));
- br_dst.u.ip4 = dst;
+ br_dst.dst.ip4 = dst;
br_dst.proto = htons(ETH_P_IP);
br_dst.vid = vid;
@@ -100,7 +128,7 @@ static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br,
struct br_ip br_dst;
memset(&br_dst, 0, sizeof(br_dst));
- br_dst.u.ip6 = *dst;
+ br_dst.dst.ip6 = *dst;
br_dst.proto = htons(ETH_P_IPV6);
br_dst.vid = vid;
@@ -125,52 +153,471 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
switch (skb->protocol) {
case htons(ETH_P_IP):
- ip.u.ip4 = ip_hdr(skb)->daddr;
+ ip.dst.ip4 = ip_hdr(skb)->daddr;
+ if (br->multicast_igmp_version == 3) {
+ struct net_bridge_mdb_entry *mdb;
+
+ ip.src.ip4 = ip_hdr(skb)->saddr;
+ mdb = br_mdb_ip_get_rcu(br, &ip);
+ if (mdb)
+ return mdb;
+ ip.src.ip4 = 0;
+ }
break;
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- ip.u.ip6 = ipv6_hdr(skb)->daddr;
+ ip.dst.ip6 = ipv6_hdr(skb)->daddr;
+ if (br->multicast_mld_version == 2) {
+ struct net_bridge_mdb_entry *mdb;
+
+ ip.src.ip6 = ipv6_hdr(skb)->saddr;
+ mdb = br_mdb_ip_get_rcu(br, &ip);
+ if (mdb)
+ return mdb;
+ memset(&ip.src.ip6, 0, sizeof(ip.src.ip6));
+ }
break;
#endif
default:
- return NULL;
+ ip.proto = 0;
+ ether_addr_copy(ip.dst.mac_addr, eth_hdr(skb)->h_dest);
}
return br_mdb_ip_get_rcu(br, &ip);
}
+static bool br_port_group_equal(struct net_bridge_port_group *p,
+ struct net_bridge_port *port,
+ const unsigned char *src)
+{
+ if (p->key.port != port)
+ return false;
+
+ if (!(port->flags & BR_MULTICAST_TO_UNICAST))
+ return true;
+
+ return ether_addr_equal(src, p->eth_addr);
+}
+
+static void __fwd_add_star_excl(struct net_bridge_port_group *pg,
+ struct br_ip *sg_ip)
+{
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_port_group *src_pg;
+
+ memset(&sg_key, 0, sizeof(sg_key));
+ sg_key.port = pg->key.port;
+ sg_key.addr = *sg_ip;
+ if (br_sg_port_find(br, &sg_key))
+ return;
+
+ src_pg = __br_multicast_add_group(br, pg->key.port, sg_ip, pg->eth_addr,
+ MCAST_INCLUDE, false, false);
+ if (IS_ERR_OR_NULL(src_pg) ||
+ src_pg->rt_protocol != RTPROT_KERNEL)
+ return;
+
+ src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL;
+}
+
+static void __fwd_del_star_excl(struct net_bridge_port_group *pg,
+ struct br_ip *sg_ip)
+{
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_port_group *src_pg;
+
+ memset(&sg_key, 0, sizeof(sg_key));
+ sg_key.port = pg->key.port;
+ sg_key.addr = *sg_ip;
+ src_pg = br_sg_port_find(br, &sg_key);
+ if (!src_pg || !(src_pg->flags & MDB_PG_FLAGS_STAR_EXCL) ||
+ src_pg->rt_protocol != RTPROT_KERNEL)
+ return;
+
+ br_multicast_find_del_pg(br, src_pg);
+}
+
+/* When a port group transitions to (or is added as) EXCLUDE we need to add it
+ * to all other ports' S,G entries which are not blocked by the current group
+ * for proper replication; the assumption is that any S,G blocked entries
+ * are already added, so the S,G,port lookup should skip them.
+ * When a port group transitions from EXCLUDE -> INCLUDE mode or is being
+ * deleted we need to remove it from all ports' S,G entries where it was
+ * automatically installed before (i.e. where MDB_PG_FLAGS_STAR_EXCL is set).
+ */
+void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg,
+ u8 filter_mode)
+{
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_port_group *pg_lst;
+ struct net_bridge_mdb_entry *mp;
+ struct br_ip sg_ip;
+
+ if (WARN_ON(!br_multicast_is_star_g(&pg->key.addr)))
+ return;
+
+ mp = br_mdb_ip_get(br, &pg->key.addr);
+ if (!mp)
+ return;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = pg->key.addr;
+ for (pg_lst = mlock_dereference(mp->ports, br);
+ pg_lst;
+ pg_lst = mlock_dereference(pg_lst->next, br)) {
+ struct net_bridge_group_src *src_ent;
+
+ if (pg_lst == pg)
+ continue;
+ hlist_for_each_entry(src_ent, &pg_lst->src_list, node) {
+ if (!(src_ent->flags & BR_SGRP_F_INSTALLED))
+ continue;
+ sg_ip.src = src_ent->addr.src;
+ switch (filter_mode) {
+ case MCAST_INCLUDE:
+ __fwd_del_star_excl(pg, &sg_ip);
+ break;
+ case MCAST_EXCLUDE:
+ __fwd_add_star_excl(pg, &sg_ip);
+ break;
+ }
+ }
+ }
+}
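
A concrete scenario for the walk above (illustrative only): let pg be a (*,G) EXCLUDE group on port A, and let another port B hold a group for the same G with an installed source S. Handling MCAST_EXCLUDE makes __fwd_add_star_excl() install an (S,G) entry for port A, flagged MDB_PG_FLAGS_STAR_EXCL, so traffic from S is still replicated to A; handling MCAST_INCLUDE (the EXCLUDE -> INCLUDE transition or deletion case) makes __fwd_del_star_excl() remove exactly those kernel-installed STAR_EXCL entries while leaving user-added ones (non-RTPROT_KERNEL) untouched.
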
+
+/* called when adding a new S,G with host_joined == false by default */
+static void br_multicast_sg_host_state(struct net_bridge_mdb_entry *star_mp,
+ struct net_bridge_port_group *sg)
+{
+ struct net_bridge_mdb_entry *sg_mp;
+
+ if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr)))
+ return;
+ if (!star_mp->host_joined)
+ return;
+
+ sg_mp = br_mdb_ip_get(star_mp->br, &sg->key.addr);
+ if (!sg_mp)
+ return;
+ sg_mp->host_joined = true;
+}
+
+/* set the host_joined state of all of *,G's S,G entries */
+static void br_multicast_star_g_host_state(struct net_bridge_mdb_entry *star_mp)
+{
+ struct net_bridge *br = star_mp->br;
+ struct net_bridge_mdb_entry *sg_mp;
+ struct net_bridge_port_group *pg;
+ struct br_ip sg_ip;
+
+ if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr)))
+ return;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = star_mp->addr;
+ for (pg = mlock_dereference(star_mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br)) {
+ struct net_bridge_group_src *src_ent;
+
+ hlist_for_each_entry(src_ent, &pg->src_list, node) {
+ if (!(src_ent->flags & BR_SGRP_F_INSTALLED))
+ continue;
+ sg_ip.src = src_ent->addr.src;
+ sg_mp = br_mdb_ip_get(br, &sg_ip);
+ if (!sg_mp)
+ continue;
+ sg_mp->host_joined = star_mp->host_joined;
+ }
+ }
+}
+
+static void br_multicast_sg_del_exclude_ports(struct net_bridge_mdb_entry *sgmp)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+
+ /* *,G exclude ports are only added to S,G entries */
+ if (WARN_ON(br_multicast_is_star_g(&sgmp->addr)))
+ return;
+
+ /* we only need the STAR_EXCLUDE ports while there are non-STAR_EXCLUDE
+ * ports; permanent entries are ignored since they're managed by user-space
+ */
+ for (pp = &sgmp->ports;
+ (p = mlock_dereference(*pp, sgmp->br)) != NULL;
+ pp = &p->next)
+ if (!(p->flags & (MDB_PG_FLAGS_STAR_EXCL |
+ MDB_PG_FLAGS_PERMANENT)))
+ return;
+
+ /* currently the host can only have joined the *,G which means
+ * we treat it as EXCLUDE {}, so for an S,G it's considered a
+ * STAR_EXCLUDE entry and we can safely leave it
+ */
+ sgmp->host_joined = false;
+
+ for (pp = &sgmp->ports;
+ (p = mlock_dereference(*pp, sgmp->br)) != NULL;) {
+ if (!(p->flags & MDB_PG_FLAGS_PERMANENT))
+ br_multicast_del_pg(sgmp, p, pp);
+ else
+ pp = &p->next;
+ }
+}
+
+void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp,
+ struct net_bridge_port_group *sg)
+{
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge *br = star_mp->br;
+ struct net_bridge_port_group *pg;
+
+ if (WARN_ON(br_multicast_is_star_g(&sg->key.addr)))
+ return;
+ if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr)))
+ return;
+
+ br_multicast_sg_host_state(star_mp, sg);
+ memset(&sg_key, 0, sizeof(sg_key));
+ sg_key.addr = sg->key.addr;
+ /* we need to add all exclude ports to the S,G */
+ for (pg = mlock_dereference(star_mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br)) {
+ struct net_bridge_port_group *src_pg;
+
+ if (pg == sg || pg->filter_mode == MCAST_INCLUDE)
+ continue;
+
+ sg_key.port = pg->key.port;
+ if (br_sg_port_find(br, &sg_key))
+ continue;
+
+ src_pg = __br_multicast_add_group(br, pg->key.port,
+ &sg->key.addr,
+ sg->eth_addr,
+ MCAST_INCLUDE, false, false);
+ if (IS_ERR_OR_NULL(src_pg) ||
+ src_pg->rt_protocol != RTPROT_KERNEL)
+ continue;
+ src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL;
+ }
+}
+
+static void br_multicast_fwd_src_add(struct net_bridge_group_src *src)
+{
+ struct net_bridge_mdb_entry *star_mp;
+ struct net_bridge_port_group *sg;
+ struct br_ip sg_ip;
+
+ if (src->flags & BR_SGRP_F_INSTALLED)
+ return;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = src->pg->key.addr;
+ sg_ip.src = src->addr.src;
+ sg = __br_multicast_add_group(src->br, src->pg->key.port, &sg_ip,
+ src->pg->eth_addr, MCAST_INCLUDE, false,
+ !timer_pending(&src->timer));
+ if (IS_ERR_OR_NULL(sg))
+ return;
+ src->flags |= BR_SGRP_F_INSTALLED;
+ sg->flags &= ~MDB_PG_FLAGS_STAR_EXCL;
+
+ /* if it was added by user-space as permanent we can skip the next steps */
+ if (sg->rt_protocol != RTPROT_KERNEL &&
+ (sg->flags & MDB_PG_FLAGS_PERMANENT))
+ return;
+
+ /* the kernel is now responsible for removing this S,G */
+ del_timer(&sg->timer);
+ star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr);
+ if (!star_mp)
+ return;
+
+ br_multicast_sg_add_exclude_ports(star_mp, sg);
+}
+
+static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src)
+{
+ struct net_bridge_port_group *p, *pg = src->pg;
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_mdb_entry *mp;
+ struct br_ip sg_ip;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = pg->key.addr;
+ sg_ip.src = src->addr.src;
+
+ mp = br_mdb_ip_get(src->br, &sg_ip);
+ if (!mp)
+ return;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, src->br)) != NULL;
+ pp = &p->next) {
+ if (!br_port_group_equal(p, pg->key.port, pg->eth_addr))
+ continue;
+
+ if (p->rt_protocol != RTPROT_KERNEL &&
+ (p->flags & MDB_PG_FLAGS_PERMANENT))
+ break;
+
+ br_multicast_del_pg(mp, p, pp);
+ break;
+ }
+ src->flags &= ~BR_SGRP_F_INSTALLED;
+}
+
+/* install the S,G and, based on src's timer, enable or disable forwarding */
+static void br_multicast_fwd_src_handle(struct net_bridge_group_src *src)
+{
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge_port_group *sg;
+ u8 old_flags;
+
+ br_multicast_fwd_src_add(src);
+
+ memset(&sg_key, 0, sizeof(sg_key));
+ sg_key.addr = src->pg->key.addr;
+ sg_key.addr.src = src->addr.src;
+ sg_key.port = src->pg->key.port;
+
+ sg = br_sg_port_find(src->br, &sg_key);
+ if (!sg || (sg->flags & MDB_PG_FLAGS_PERMANENT))
+ return;
+
+ old_flags = sg->flags;
+ if (timer_pending(&src->timer))
+ sg->flags &= ~MDB_PG_FLAGS_BLOCKED;
+ else
+ sg->flags |= MDB_PG_FLAGS_BLOCKED;
+
+ if (old_flags != sg->flags) {
+ struct net_bridge_mdb_entry *sg_mp;
+
+ sg_mp = br_mdb_ip_get(src->br, &sg_key.addr);
+ if (!sg_mp)
+ return;
+ br_mdb_notify(src->br->dev, sg_mp, sg, RTM_NEWMDB);
+ }
+}
+
+static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_mdb_entry *mp;
+
+ mp = container_of(gc, struct net_bridge_mdb_entry, mcast_gc);
+ WARN_ON(!hlist_unhashed(&mp->mdb_node));
+ WARN_ON(mp->ports);
+
+ del_timer_sync(&mp->timer);
+ kfree_rcu(mp, rcu);
+}
+
+static void br_multicast_del_mdb_entry(struct net_bridge_mdb_entry *mp)
+{
+ struct net_bridge *br = mp->br;
+
+ rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
+ br_mdb_rht_params);
+ hlist_del_init_rcu(&mp->mdb_node);
+ hlist_add_head(&mp->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+}
+
static void br_multicast_group_expired(struct timer_list *t)
{
struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer);
struct net_bridge *br = mp->br;
spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) || timer_pending(&mp->timer))
+ if (hlist_unhashed(&mp->mdb_node) || !netif_running(br->dev) ||
+ timer_pending(&mp->timer))
goto out;
br_multicast_host_leave(mp, true);
if (mp->ports)
goto out;
+ br_multicast_del_mdb_entry(mp);
+out:
+ spin_unlock(&br->multicast_lock);
+}
- rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
- br_mdb_rht_params);
- hlist_del_rcu(&mp->mdb_node);
+static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_group_src *src;
- kfree_rcu(mp, rcu);
+ src = container_of(gc, struct net_bridge_group_src, mcast_gc);
+ WARN_ON(!hlist_unhashed(&src->node));
-out:
- spin_unlock(&br->multicast_lock);
+ del_timer_sync(&src->timer);
+ kfree_rcu(src, rcu);
+}
+
+static void br_multicast_del_group_src(struct net_bridge_group_src *src)
+{
+ struct net_bridge *br = src->pg->key.port->br;
+
+ br_multicast_fwd_src_remove(src);
+ hlist_del_init_rcu(&src->node);
+ src->pg->src_ents--;
+ hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+}
+
+static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_port_group *pg;
+
+ pg = container_of(gc, struct net_bridge_port_group, mcast_gc);
+ WARN_ON(!hlist_unhashed(&pg->mglist));
+ WARN_ON(!hlist_empty(&pg->src_list));
+
+ del_timer_sync(&pg->rexmit_timer);
+ del_timer_sync(&pg->timer);
+ kfree_rcu(pg, rcu);
}
-static void br_multicast_del_pg(struct net_bridge *br,
- struct net_bridge_port_group *pg)
+void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_port_group __rcu **pp)
{
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+
+ rcu_assign_pointer(*pp, pg->next);
+ hlist_del_init(&pg->mglist);
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node)
+ br_multicast_del_group_src(ent);
+ br_mdb_notify(br->dev, mp, pg, RTM_DELMDB);
+ if (!br_multicast_is_star_g(&mp->addr)) {
+ rhashtable_remove_fast(&br->sg_port_tbl, &pg->rhnode,
+ br_sg_port_rht_params);
+ br_multicast_sg_del_exclude_ports(mp);
+ } else {
+ br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE);
+ }
+ hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+
+ if (!mp->ports && !mp->host_joined && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+}
+
+static void br_multicast_find_del_pg(struct net_bridge *br,
+ struct net_bridge_port_group *pg)
+{
+ struct net_bridge_port_group __rcu **pp;
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
- struct net_bridge_port_group __rcu **pp;
- mp = br_mdb_ip_get(br, &pg->addr);
+ mp = br_mdb_ip_get(br, &pg->key.addr);
if (WARN_ON(!mp))
return;
@@ -180,17 +627,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
if (p != pg)
continue;
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
- p->flags);
- kfree_rcu(p, rcu);
-
- if (!mp->ports && !mp->host_joined &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
-
+ br_multicast_del_pg(mp, pg, pp);
return;
}
@@ -200,35 +637,98 @@ static void br_multicast_del_pg(struct net_bridge *br,
static void br_multicast_port_group_expired(struct timer_list *t)
{
struct net_bridge_port_group *pg = from_timer(pg, t, timer);
- struct net_bridge *br = pg->port->br;
+ struct net_bridge_group_src *src_ent;
+ struct net_bridge *br = pg->key.port->br;
+ struct hlist_node *tmp;
+ bool changed;
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) || timer_pending(&pg->timer) ||
hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT)
goto out;
- br_multicast_del_pg(br, pg);
+ changed = !!(pg->filter_mode == MCAST_EXCLUDE);
+ pg->filter_mode = MCAST_INCLUDE;
+ hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) {
+ if (!timer_pending(&src_ent->timer)) {
+ br_multicast_del_group_src(src_ent);
+ changed = true;
+ }
+ }
+
+ if (hlist_empty(&pg->src_list)) {
+ br_multicast_find_del_pg(br, pg);
+ } else if (changed) {
+ struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->key.addr);
+ if (changed && br_multicast_is_star_g(&pg->key.addr))
+ br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE);
+
+ if (WARN_ON(!mp))
+ goto out;
+ br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB);
+ }
out:
spin_unlock(&br->multicast_lock);
}
-static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
- __be32 group,
- u8 *igmp_type)
+static void br_multicast_gc(struct hlist_head *head)
{
+ struct net_bridge_mcast_gc *gcent;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(gcent, tmp, head, gc_node) {
+ hlist_del_init(&gcent->gc_node);
+ gcent->destroy(gcent);
+ }
+}
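
The functions above make up the new deferred-destruction path; a short sketch of the flow, with the caveat that the mcast_gc struct layout and the work handler that drains br->mcast_gc_list live outside this excerpt (in br_private.h and further down in this file):

/* 1) each object embeds a struct net_bridge_mcast_gc whose ->destroy callback
 *    is set when the object is created (e.g. to br_multicast_destroy_mdb_entry);
 * 2) deletion under multicast_lock only unlinks the object and queues it:
 *
 *	hlist_add_head(&obj->mcast_gc.gc_node, &br->mcast_gc_list);
 *	queue_work(system_long_wq, &br->mcast_gc_work);
 *
 * 3) the work item later calls br_multicast_gc() outside the spinlock, where
 *    sleeping is allowed, which is what permits del_timer_sync() in the
 *    destroy callbacks.
 */
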
+
+static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
+ struct net_bridge_port_group *pg,
+ __be32 ip_dst, __be32 group,
+ bool with_srcs, bool over_lmqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
+{
+ struct net_bridge_port *p = pg ? pg->key.port : NULL;
+ struct net_bridge_group_src *ent;
+ size_t pkt_size, igmp_hdr_size;
+ unsigned long now = jiffies;
struct igmpv3_query *ihv3;
- size_t igmp_hdr_size;
+ void *csum_start = NULL;
+ __sum16 *csum = NULL;
struct sk_buff *skb;
struct igmphdr *ih;
struct ethhdr *eth;
+ unsigned long lmqt;
struct iphdr *iph;
+ u16 lmqt_srcs = 0;
igmp_hdr_size = sizeof(*ih);
- if (br->multicast_igmp_version == 3)
+ if (br->multicast_igmp_version == 3) {
igmp_hdr_size = sizeof(*ihv3);
- skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) +
- igmp_hdr_size + 4);
+ if (pg && with_srcs) {
+ lmqt = now + (br->multicast_last_member_interval *
+ br->multicast_last_member_count);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_lmqt == time_after(ent->timer.expires,
+ lmqt) &&
+ ent->src_query_rexmit_cnt > 0)
+ lmqt_srcs++;
+ }
+
+ if (!lmqt_srcs)
+ return NULL;
+ igmp_hdr_size += lmqt_srcs * sizeof(__be32);
+ }
+ }
+
+ pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size;
+ if ((p && pkt_size > p->dev->mtu) ||
+ pkt_size > br->dev->mtu)
+ return NULL;
+
+ skb = netdev_alloc_skb_ip_align(br->dev, pkt_size);
if (!skb)
goto out;
@@ -238,29 +738,24 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
eth = eth_hdr(skb);
ether_addr_copy(eth->h_source, br->dev->dev_addr);
- eth->h_dest[0] = 1;
- eth->h_dest[1] = 0;
- eth->h_dest[2] = 0x5e;
- eth->h_dest[3] = 0;
- eth->h_dest[4] = 0;
- eth->h_dest[5] = 1;
+ ip_eth_mc_map(ip_dst, eth->h_dest);
eth->h_proto = htons(ETH_P_IP);
skb_put(skb, sizeof(*eth));
skb_set_network_header(skb, skb->len);
iph = ip_hdr(skb);
+ iph->tot_len = htons(pkt_size - sizeof(*eth));
iph->version = 4;
iph->ihl = 6;
iph->tos = 0xc0;
- iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4);
iph->id = 0;
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->protocol = IPPROTO_IGMP;
iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ?
inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0;
- iph->daddr = htonl(INADDR_ALLHOSTS_GROUP);
+ iph->daddr = ip_dst;
((u8 *)&iph[1])[0] = IPOPT_RA;
((u8 *)&iph[1])[1] = 4;
((u8 *)&iph[1])[2] = 0;
@@ -280,7 +775,8 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
(HZ / IGMP_TIMER_SCALE);
ih->group = group;
ih->csum = 0;
- ih->csum = ip_compute_csum((void *)ih, sizeof(*ih));
+ csum = &ih->csum;
+ csum_start = (void *)ih;
break;
case 3:
ihv3 = igmpv3_query_hdr(skb);
@@ -290,15 +786,40 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
(HZ / IGMP_TIMER_SCALE);
ihv3->group = group;
ihv3->qqic = br->multicast_query_interval / HZ;
- ihv3->nsrcs = 0;
+ ihv3->nsrcs = htons(lmqt_srcs);
ihv3->resv = 0;
- ihv3->suppress = 0;
+ ihv3->suppress = sflag;
ihv3->qrv = 2;
ihv3->csum = 0;
- ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3));
+ csum = &ihv3->csum;
+ csum_start = (void *)ihv3;
+ if (!pg || !with_srcs)
+ break;
+
+ lmqt_srcs = 0;
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_lmqt == time_after(ent->timer.expires,
+ lmqt) &&
+ ent->src_query_rexmit_cnt > 0) {
+ ihv3->srcs[lmqt_srcs++] = ent->addr.src.ip4;
+ ent->src_query_rexmit_cnt--;
+ if (need_rexmit && ent->src_query_rexmit_cnt)
+ *need_rexmit = true;
+ }
+ }
+ if (WARN_ON(lmqt_srcs != ntohs(ihv3->nsrcs))) {
+ kfree_skb(skb);
+ return NULL;
+ }
break;
}
+ if (WARN_ON(!csum || !csum_start)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ *csum = ip_compute_csum(csum_start, igmp_hdr_size);
skb_put(skb, igmp_hdr_size);
__skb_pull(skb, sizeof(*eth));
@@ -308,23 +829,54 @@ out:
#if IS_ENABLED(CONFIG_IPV6)
static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
- const struct in6_addr *grp,
- u8 *igmp_type)
-{
+ struct net_bridge_port_group *pg,
+ const struct in6_addr *ip6_dst,
+ const struct in6_addr *group,
+ bool with_srcs, bool over_llqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
+{
+ struct net_bridge_port *p = pg ? pg->key.port : NULL;
+ struct net_bridge_group_src *ent;
+ size_t pkt_size, mld_hdr_size;
+ unsigned long now = jiffies;
struct mld2_query *mld2q;
+ void *csum_start = NULL;
unsigned long interval;
+ __sum16 *csum = NULL;
struct ipv6hdr *ip6h;
struct mld_msg *mldq;
- size_t mld_hdr_size;
struct sk_buff *skb;
+ unsigned long llqt;
struct ethhdr *eth;
+ u16 llqt_srcs = 0;
u8 *hopopt;
mld_hdr_size = sizeof(*mldq);
- if (br->multicast_mld_version == 2)
+ if (br->multicast_mld_version == 2) {
mld_hdr_size = sizeof(*mld2q);
- skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) +
- 8 + mld_hdr_size);
+ if (pg && with_srcs) {
+ llqt = now + (br->multicast_last_member_interval *
+ br->multicast_last_member_count);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_llqt == time_after(ent->timer.expires,
+ llqt) &&
+ ent->src_query_rexmit_cnt > 0)
+ llqt_srcs++;
+ }
+
+ if (!llqt_srcs)
+ return NULL;
+ mld_hdr_size += llqt_srcs * sizeof(struct in6_addr);
+ }
+ }
+
+ pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size;
+ if ((p && pkt_size > p->dev->mtu) ||
+ pkt_size > br->dev->mtu)
+ return NULL;
+
+ skb = netdev_alloc_skb_ip_align(br->dev, pkt_size);
if (!skb)
goto out;
@@ -346,7 +898,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
ip6h->payload_len = htons(8 + mld_hdr_size);
ip6h->nexthdr = IPPROTO_HOPOPTS;
ip6h->hop_limit = 1;
- ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
+ ip6h->daddr = *ip6_dst;
if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
&ip6h->saddr)) {
kfree_skb(skb);
@@ -371,7 +923,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
/* ICMPv6 */
skb_set_transport_header(skb, skb->len);
- interval = ipv6_addr_any(grp) ?
+ interval = ipv6_addr_any(group) ?
br->multicast_query_response_interval :
br->multicast_last_member_interval;
*igmp_type = ICMPV6_MGM_QUERY;
@@ -383,12 +935,9 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
mldq->mld_cksum = 0;
mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
mldq->mld_reserved = 0;
- mldq->mld_mca = *grp;
- mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- sizeof(*mldq), IPPROTO_ICMPV6,
- csum_partial(mldq,
- sizeof(*mldq),
- 0));
+ mldq->mld_mca = *group;
+ csum = &mldq->mld_cksum;
+ csum_start = (void *)mldq;
break;
case 2:
mld2q = (struct mld2_query *)icmp6_hdr(skb);
@@ -398,21 +947,43 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
mld2q->mld2q_cksum = 0;
mld2q->mld2q_resv1 = 0;
mld2q->mld2q_resv2 = 0;
- mld2q->mld2q_suppress = 0;
+ mld2q->mld2q_suppress = sflag;
mld2q->mld2q_qrv = 2;
- mld2q->mld2q_nsrcs = 0;
+ mld2q->mld2q_nsrcs = htons(llqt_srcs);
mld2q->mld2q_qqic = br->multicast_query_interval / HZ;
- mld2q->mld2q_mca = *grp;
- mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- sizeof(*mld2q),
- IPPROTO_ICMPV6,
- csum_partial(mld2q,
- sizeof(*mld2q),
- 0));
+ mld2q->mld2q_mca = *group;
+ csum = &mld2q->mld2q_cksum;
+ csum_start = (void *)mld2q;
+ if (!pg || !with_srcs)
+ break;
+
+ llqt_srcs = 0;
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_llqt == time_after(ent->timer.expires,
+ llqt) &&
+ ent->src_query_rexmit_cnt > 0) {
+ mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.src.ip6;
+ ent->src_query_rexmit_cnt--;
+ if (need_rexmit && ent->src_query_rexmit_cnt)
+ *need_rexmit = true;
+ }
+ }
+ if (WARN_ON(llqt_srcs != ntohs(mld2q->mld2q_nsrcs))) {
+ kfree_skb(skb);
+ return NULL;
+ }
break;
}
- skb_put(skb, mld_hdr_size);
+ if (WARN_ON(!csum || !csum_start)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ *csum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, mld_hdr_size,
+ IPPROTO_ICMPV6,
+ csum_partial(csum_start, mld_hdr_size, 0));
+ skb_put(skb, mld_hdr_size);
__skb_pull(skb, sizeof(*eth));
out:
@@ -421,16 +992,39 @@ out:
#endif
static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br,
- struct br_ip *addr,
- u8 *igmp_type)
+ struct net_bridge_port_group *pg,
+ struct br_ip *ip_dst,
+ struct br_ip *group,
+ bool with_srcs, bool over_lmqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
{
- switch (addr->proto) {
+ __be32 ip4_dst;
+
+ switch (group->proto) {
case htons(ETH_P_IP):
- return br_ip4_multicast_alloc_query(br, addr->u.ip4, igmp_type);
+ ip4_dst = ip_dst ? ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP);
+ return br_ip4_multicast_alloc_query(br, pg,
+ ip4_dst, group->dst.ip4,
+ with_srcs, over_lmqt,
+ sflag, igmp_type,
+ need_rexmit);
#if IS_ENABLED(CONFIG_IPV6)
- case htons(ETH_P_IPV6):
- return br_ip6_multicast_alloc_query(br, &addr->u.ip6,
- igmp_type);
+ case htons(ETH_P_IPV6): {
+ struct in6_addr ip6_dst;
+
+ if (ip_dst)
+ ip6_dst = ip_dst->dst.ip6;
+ else
+ ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0,
+ htonl(1));
+
+ return br_ip6_multicast_alloc_query(br, pg,
+ &ip6_dst, &group->dst.ip6,
+ with_srcs, over_lmqt,
+ sflag, igmp_type,
+ need_rexmit);
+ }
#endif
}
return NULL;
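The dispatcher above now threads an explicit destination through to the per-family builders: group-and-source-specific queries are sent to the group address itself, while callers that pass no destination fall back to the all-hosts group (224.0.0.1) or the IPv6 all-nodes link-local address ff02::1, matching the old hard-coded behaviour. On the IPv4 side the Ethernet destination is then derived from that address with ip_eth_mc_map() rather than spelled out byte by byte. A stand-alone sketch of that well-known mapping (01:00:5e plus the low 23 bits of the group address), not part of the patch:

/* Illustrative only -- user-space equivalent of ip_eth_mc_map(). */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static void ip_eth_mc_map(uint32_t naddr, unsigned char *buf)
{
	uint32_t addr = ntohl(naddr);

	buf[0] = 0x01;
	buf[1] = 0x00;
	buf[2] = 0x5e;
	buf[5] = addr & 0xff;
	addr >>= 8;
	buf[4] = addr & 0xff;
	addr >>= 8;
	buf[3] = addr & 0x7f;	/* only the low 23 bits are mapped */
}

int main(void)
{
	unsigned char mac[6];

	ip_eth_mc_map(inet_addr("224.0.0.1"), mac);	/* INADDR_ALLHOSTS_GROUP */
	printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
	       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	return 0;
}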
@@ -457,6 +1051,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
mp->br = br;
mp->addr = *group;
+ mp->mcast_gc.destroy = br_multicast_destroy_mdb_entry;
timer_setup(&mp->timer, br_multicast_group_expired, 0);
err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode,
br_mdb_rht_params);
@@ -470,12 +1065,101 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
return mp;
}
+static void br_multicast_group_src_expired(struct timer_list *t)
+{
+ struct net_bridge_group_src *src = from_timer(src, t, timer);
+ struct net_bridge_port_group *pg;
+ struct net_bridge *br = src->br;
+
+ spin_lock(&br->multicast_lock);
+ if (hlist_unhashed(&src->node) || !netif_running(br->dev) ||
+ timer_pending(&src->timer))
+ goto out;
+
+ pg = src->pg;
+ if (pg->filter_mode == MCAST_INCLUDE) {
+ br_multicast_del_group_src(src);
+ if (!hlist_empty(&pg->src_list))
+ goto out;
+ br_multicast_find_del_pg(br, pg);
+ } else {
+ br_multicast_fwd_src_handle(src);
+ }
+
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static struct net_bridge_group_src *
+br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip)
+{
+ struct net_bridge_group_src *ent;
+
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (ip->src.ip4 == ent->addr.src.ip4)
+ return ent;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (!ipv6_addr_cmp(&ent->addr.src.ip6, &ip->src.ip6))
+ return ent;
+ break;
+#endif
+ }
+
+ return NULL;
+}
+
+static struct net_bridge_group_src *
+br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip)
+{
+ struct net_bridge_group_src *grp_src;
+
+ if (unlikely(pg->src_ents >= PG_SRC_ENT_LIMIT))
+ return NULL;
+
+ switch (src_ip->proto) {
+ case htons(ETH_P_IP):
+ if (ipv4_is_zeronet(src_ip->src.ip4) ||
+ ipv4_is_multicast(src_ip->src.ip4))
+ return NULL;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ if (ipv6_addr_any(&src_ip->src.ip6) ||
+ ipv6_addr_is_multicast(&src_ip->src.ip6))
+ return NULL;
+ break;
+#endif
+ }
+
+ grp_src = kzalloc(sizeof(*grp_src), GFP_ATOMIC);
+ if (unlikely(!grp_src))
+ return NULL;
+
+ grp_src->pg = pg;
+ grp_src->br = pg->key.port->br;
+ grp_src->addr = *src_ip;
+ grp_src->mcast_gc.destroy = br_multicast_destroy_group_src;
+ timer_setup(&grp_src->timer, br_multicast_group_src_expired, 0);
+
+ hlist_add_head_rcu(&grp_src->node, &pg->src_list);
+ pg->src_ents++;
+
+ return grp_src;
+}
+
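br_multicast_new_group_src() above refuses source addresses that can never legitimately appear in an IGMPv3/MLDv2 source list (the 0.0.0.0/8 zeronet range and multicast addresses for IPv4, the unspecified and multicast addresses for IPv6) and caps each port group at PG_SRC_ENT_LIMIT (32) entries, which bounds the memory a single reporter can make the bridge allocate. A small user-space sketch of the IPv4 checks, not part of the patch (the function name is made up), with addresses kept in network byte order as in the kernel helpers:

/* Illustrative only -- mirrors ipv4_is_zeronet()/ipv4_is_multicast(). */
#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool valid_igmpv3_source(uint32_t saddr)	/* network byte order */
{
	bool zeronet = (saddr & htonl(0xff000000)) == 0;		  /* 0.0.0.0/8 */
	bool mcast   = (saddr & htonl(0xf0000000)) == htonl(0xe0000000);  /* 224.0.0.0/4 */

	return !zeronet && !mcast;
}

int main(void)
{
	printf("10.0.0.1  -> %d\n", valid_igmpv3_source(inet_addr("10.0.0.1")));
	printf("0.0.0.5   -> %d\n", valid_igmpv3_source(inet_addr("0.0.0.5")));
	printf("239.1.1.1 -> %d\n", valid_igmpv3_source(inet_addr("239.1.1.1")));
	return 0;
}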
struct net_bridge_port_group *br_multicast_new_port_group(
struct net_bridge_port *port,
struct br_ip *group,
struct net_bridge_port_group __rcu *next,
unsigned char flags,
- const unsigned char *src)
+ const unsigned char *src,
+ u8 filter_mode,
+ u8 rt_protocol)
{
struct net_bridge_port_group *p;
@@ -483,12 +1167,25 @@ struct net_bridge_port_group *br_multicast_new_port_group(
if (unlikely(!p))
return NULL;
- p->addr = *group;
- p->port = port;
+ p->key.addr = *group;
+ p->key.port = port;
p->flags = flags;
+ p->filter_mode = filter_mode;
+ p->rt_protocol = rt_protocol;
+ p->mcast_gc.destroy = br_multicast_destroy_port_group;
+ INIT_HLIST_HEAD(&p->src_list);
+
+ if (!br_multicast_is_star_g(group) &&
+ rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode,
+ br_sg_port_rht_params)) {
+ kfree(p);
+ return NULL;
+ }
+
rcu_assign_pointer(p->next, next);
- hlist_add_head(&p->mglist, &port->mglist);
timer_setup(&p->timer, br_multicast_port_group_expired, 0);
+ timer_setup(&p->rexmit_timer, br_multicast_port_group_rexmit, 0);
+ hlist_add_head(&p->mglist, &port->mglist);
if (src)
memcpy(p->eth_addr, src, ETH_ALEN);
@@ -498,27 +1195,19 @@ struct net_bridge_port_group *br_multicast_new_port_group(
return p;
}
-static bool br_port_group_equal(struct net_bridge_port_group *p,
- struct net_bridge_port *port,
- const unsigned char *src)
-{
- if (p->port != port)
- return false;
-
- if (!(port->flags & BR_MULTICAST_TO_UNICAST))
- return true;
-
- return ether_addr_equal(src, p->eth_addr);
-}
-
void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify)
{
if (!mp->host_joined) {
mp->host_joined = true;
+ if (br_multicast_is_star_g(&mp->addr))
+ br_multicast_star_g_host_state(mp);
if (notify)
- br_mdb_notify(mp->br->dev, NULL, &mp->addr,
- RTM_NEWMDB, 0);
+ br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB);
}
+
+ if (br_group_is_l2(&mp->addr))
+ return;
+
mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval);
}
@@ -528,30 +1217,33 @@ void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify)
return;
mp->host_joined = false;
+ if (br_multicast_is_star_g(&mp->addr))
+ br_multicast_star_g_host_state(mp);
if (notify)
- br_mdb_notify(mp->br->dev, NULL, &mp->addr, RTM_DELMDB, 0);
+ br_mdb_notify(mp->br->dev, mp, NULL, RTM_DELMDB);
}
-static int br_multicast_add_group(struct net_bridge *br,
- struct net_bridge_port *port,
- struct br_ip *group,
- const unsigned char *src)
+static struct net_bridge_port_group *
+__br_multicast_add_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct br_ip *group,
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1,
+ bool blocked)
{
struct net_bridge_port_group __rcu **pp;
- struct net_bridge_port_group *p;
+ struct net_bridge_port_group *p = NULL;
struct net_bridge_mdb_entry *mp;
unsigned long now = jiffies;
- int err;
- spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) ||
(port && port->state == BR_STATE_DISABLED))
goto out;
mp = br_multicast_new_group(br, group);
- err = PTR_ERR(mp);
if (IS_ERR(mp))
- goto err;
+ return ERR_PTR(PTR_ERR(mp));
if (!port) {
br_multicast_host_join(mp, true);
@@ -563,23 +1255,46 @@ static int br_multicast_add_group(struct net_bridge *br,
pp = &p->next) {
if (br_port_group_equal(p, port, src))
goto found;
- if ((unsigned long)p->port < (unsigned long)port)
+ if ((unsigned long)p->key.port < (unsigned long)port)
break;
}
- p = br_multicast_new_port_group(port, group, *pp, 0, src);
- if (unlikely(!p))
- goto err;
+ p = br_multicast_new_port_group(port, group, *pp, 0, src,
+ filter_mode, RTPROT_KERNEL);
+ if (unlikely(!p)) {
+ p = ERR_PTR(-ENOMEM);
+ goto out;
+ }
rcu_assign_pointer(*pp, p);
- br_mdb_notify(br->dev, port, group, RTM_NEWMDB, 0);
+ if (blocked)
+ p->flags |= MDB_PG_FLAGS_BLOCKED;
+ br_mdb_notify(br->dev, mp, p, RTM_NEWMDB);
found:
- mod_timer(&p->timer, now + br->multicast_membership_interval);
+ if (igmpv2_mldv1)
+ mod_timer(&p->timer, now + br->multicast_membership_interval);
+
out:
- err = 0;
+ return p;
+}
-err:
+static int br_multicast_add_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct br_ip *group,
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1)
+{
+ struct net_bridge_port_group *pg;
+ int err;
+
+ spin_lock(&br->multicast_lock);
+ pg = __br_multicast_add_group(br, port, group, src, filter_mode,
+ igmpv2_mldv1, false);
+ /* NULL is considered valid for host joined groups */
+ err = IS_ERR(pg) ? PTR_ERR(pg) : 0;
spin_unlock(&br->multicast_lock);
+
return err;
}
@@ -587,19 +1302,23 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
__be32 group,
__u16 vid,
- const unsigned char *src)
+ const unsigned char *src,
+ bool igmpv2)
{
struct br_ip br_group;
+ u8 filter_mode;
if (ipv4_is_local_multicast(group))
return 0;
memset(&br_group, 0, sizeof(br_group));
- br_group.u.ip4 = group;
+ br_group.dst.ip4 = group;
br_group.proto = htons(ETH_P_IP);
br_group.vid = vid;
+ filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE;
- return br_multicast_add_group(br, port, &br_group, src);
+ return br_multicast_add_group(br, port, &br_group, src, filter_mode,
+ igmpv2);
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -607,19 +1326,23 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
const struct in6_addr *group,
__u16 vid,
- const unsigned char *src)
+ const unsigned char *src,
+ bool mldv1)
{
struct br_ip br_group;
+ u8 filter_mode;
if (ipv6_addr_is_ll_all_nodes(group))
return 0;
memset(&br_group, 0, sizeof(br_group));
- br_group.u.ip6 = *group;
+ br_group.dst.ip6 = *group;
br_group.proto = htons(ETH_P_IPV6);
br_group.vid = vid;
+ filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE;
- return br_multicast_add_group(br, port, &br_group, src);
+ return br_multicast_add_group(br, port, &br_group, src, filter_mode,
+ mldv1);
}
#endif
@@ -702,21 +1425,30 @@ static void br_multicast_select_own_querier(struct net_bridge *br,
struct sk_buff *skb)
{
if (ip->proto == htons(ETH_P_IP))
- br->ip4_querier.addr.u.ip4 = ip_hdr(skb)->saddr;
+ br->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr;
#if IS_ENABLED(CONFIG_IPV6)
else
- br->ip6_querier.addr.u.ip6 = ipv6_hdr(skb)->saddr;
+ br->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr;
#endif
}
static void __br_multicast_send_query(struct net_bridge *br,
struct net_bridge_port *port,
- struct br_ip *ip)
-{
+ struct net_bridge_port_group *pg,
+ struct br_ip *ip_dst,
+ struct br_ip *group,
+ bool with_srcs,
+ u8 sflag,
+ bool *need_rexmit)
+{
+ bool over_lmqt = !!sflag;
struct sk_buff *skb;
u8 igmp_type;
- skb = br_multicast_alloc_query(br, ip, &igmp_type);
+again_under_lmqt:
+ skb = br_multicast_alloc_query(br, pg, ip_dst, group, with_srcs,
+ over_lmqt, sflag, &igmp_type,
+ need_rexmit);
if (!skb)
return;
@@ -727,8 +1459,13 @@ static void __br_multicast_send_query(struct net_bridge *br,
NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
dev_net(port->dev), NULL, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
+
+ if (over_lmqt && with_srcs && sflag) {
+ over_lmqt = false;
+ goto again_under_lmqt;
+ }
} else {
- br_multicast_select_own_querier(br, ip, skb);
+ br_multicast_select_own_querier(br, group, skb);
br_multicast_count(br, port, skb, igmp_type,
BR_MCAST_DIR_RX);
netif_rx(skb);
@@ -748,7 +1485,7 @@ static void br_multicast_send_query(struct net_bridge *br,
!br_opt_get(br, BROPT_MULTICAST_QUERIER))
return;
- memset(&br_group.u, 0, sizeof(br_group.u));
+ memset(&br_group.dst, 0, sizeof(br_group.dst));
if (port ? (own_query == &port->ip4_own_query) :
(own_query == &br->ip4_own_query)) {
@@ -764,7 +1501,8 @@ static void br_multicast_send_query(struct net_bridge *br,
if (!other_query || timer_pending(&other_query->timer))
return;
- __br_multicast_send_query(br, port, &br_group);
+ __br_multicast_send_query(br, port, NULL, NULL, &br_group, false, 0,
+ NULL);
time = jiffies;
time += own_query->startup_sent < br->multicast_startup_query_count ?
@@ -809,6 +1547,44 @@ static void br_ip6_multicast_port_query_expired(struct timer_list *t)
}
#endif
+static void br_multicast_port_group_rexmit(struct timer_list *t)
+{
+ struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer);
+ struct bridge_mcast_other_query *other_query = NULL;
+ struct net_bridge *br = pg->key.port->br;
+ bool need_rexmit = false;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) ||
+ !br_opt_get(br, BROPT_MULTICAST_ENABLED) ||
+ !br_opt_get(br, BROPT_MULTICAST_QUERIER))
+ goto out;
+
+ if (pg->key.addr.proto == htons(ETH_P_IP))
+ other_query = &br->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &br->ip6_other_query;
+#endif
+
+ if (!other_query || timer_pending(&other_query->timer))
+ goto out;
+
+ if (pg->grp_query_rexmit_cnt) {
+ pg->grp_query_rexmit_cnt--;
+ __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr,
+ &pg->key.addr, false, 1, NULL);
+ }
+ __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr,
+ &pg->key.addr, true, 0, &need_rexmit);
+
+ if (pg->grp_query_rexmit_cnt || need_rexmit)
+ mod_timer(&pg->rexmit_timer, jiffies +
+ br->multicast_last_member_interval);
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
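The rexmit timer added here drives the retransmission schedule for Group-Specific and Group-and-Source-Specific queries: while grp_query_rexmit_cnt is non-zero a group query is resent, a group-and-source query is then built from the sources whose src_query_rexmit_cnt is still non-zero, and the timer is re-armed at multicast_last_member_interval until both counters drain. Like the other query paths it stays quiet while another querier is active (other_query timer pending) or while snooping or querier support is disabled.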
static void br_mc_disabled_update(struct net_device *dev, bool value)
{
struct switchdev_attr attr = {
@@ -847,13 +1623,16 @@ void br_multicast_del_port(struct net_bridge_port *port)
{
struct net_bridge *br = port->br;
struct net_bridge_port_group *pg;
+ HLIST_HEAD(deleted_head);
struct hlist_node *n;
/* Take care of the remaining groups, only perm ones should be left */
spin_lock_bh(&br->multicast_lock);
hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
- br_multicast_del_pg(br, pg);
+ br_multicast_find_del_pg(br, pg);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
spin_unlock_bh(&br->multicast_lock);
+ br_multicast_gc(&deleted_head);
del_timer_sync(&port->multicast_router_timer);
free_percpu(port->mcast_stats);
}
@@ -901,7 +1680,7 @@ void br_multicast_disable_port(struct net_bridge_port *port)
spin_lock(&br->multicast_lock);
hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
- br_multicast_del_pg(br, pg);
+ br_multicast_find_del_pg(br, pg);
__del_port_router(port);
@@ -913,20 +1692,574 @@ void br_multicast_disable_port(struct net_bridge_port *port)
spin_unlock(&br->multicast_lock);
}
+static int __grp_src_delete_marked(struct net_bridge_port_group *pg)
+{
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+ int deleted = 0;
+
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node)
+ if (ent->flags & BR_SGRP_F_DELETE) {
+ br_multicast_del_group_src(ent);
+ deleted++;
+ }
+
+ return deleted;
+}
+
+static void __grp_src_mod_timer(struct net_bridge_group_src *src,
+ unsigned long expires)
+{
+ mod_timer(&src->timer, expires);
+ br_multicast_fwd_src_handle(src);
+}
+
+static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg)
+{
+ struct bridge_mcast_other_query *other_query = NULL;
+ struct net_bridge *br = pg->key.port->br;
+ u32 lmqc = br->multicast_last_member_count;
+ unsigned long lmqt, lmi, now = jiffies;
+ struct net_bridge_group_src *ent;
+
+ if (!netif_running(br->dev) ||
+ !br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ if (pg->key.addr.proto == htons(ETH_P_IP))
+ other_query = &br->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &br->ip6_other_query;
+#endif
+
+ lmqt = now + br_multicast_lmqt(br);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (ent->flags & BR_SGRP_F_SEND) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ if (ent->timer.expires > lmqt) {
+ if (br_opt_get(br, BROPT_MULTICAST_QUERIER) &&
+ other_query &&
+ !timer_pending(&other_query->timer))
+ ent->src_query_rexmit_cnt = lmqc;
+ __grp_src_mod_timer(ent, lmqt);
+ }
+ }
+ }
+
+ if (!br_opt_get(br, BROPT_MULTICAST_QUERIER) ||
+ !other_query || timer_pending(&other_query->timer))
+ return;
+
+ __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr,
+ &pg->key.addr, true, 1, NULL);
+
+ lmi = now + br->multicast_last_member_interval;
+ if (!timer_pending(&pg->rexmit_timer) ||
+ time_after(pg->rexmit_timer.expires, lmi))
+ mod_timer(&pg->rexmit_timer, lmi);
+}
+
+static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg)
+{
+ struct bridge_mcast_other_query *other_query = NULL;
+ struct net_bridge *br = pg->key.port->br;
+ unsigned long now = jiffies, lmi;
+
+ if (!netif_running(br->dev) ||
+ !br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ if (pg->key.addr.proto == htons(ETH_P_IP))
+ other_query = &br->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &br->ip6_other_query;
+#endif
+
+ if (br_opt_get(br, BROPT_MULTICAST_QUERIER) &&
+ other_query && !timer_pending(&other_query->timer)) {
+ lmi = now + br->multicast_last_member_interval;
+ pg->grp_query_rexmit_cnt = br->multicast_last_member_count - 1;
+ __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr,
+ &pg->key.addr, false, 0, NULL);
+ if (!timer_pending(&pg->rexmit_timer) ||
+ time_after(pg->rexmit_timer.expires, lmi))
+ mod_timer(&pg->rexmit_timer, lmi);
+ }
+
+ if (pg->filter_mode == MCAST_EXCLUDE &&
+ (!timer_pending(&pg->timer) ||
+ time_after(pg->timer.expires, now + br_multicast_lmqt(br))))
+ mod_timer(&pg->timer, now + br_multicast_lmqt(br));
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) IS_IN (B) INCLUDE (A+B) (B)=GMI
+ * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI
+ * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI
+ */
+static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+
+ if (ent)
+ __grp_src_mod_timer(ent, now + br_multicast_gmi(br));
+ srcs += src_size;
+ }
+
+ return changed;
+}
+
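The state/action tables in these comments follow the router-side processing rules of RFC 3376 section 6.4 (IGMPv3) and their MLDv2 counterpart in RFC 3810; GMI is the Group Membership Interval returned by br_multicast_gmi(). As a concrete instance of the row handled by br_multicast_isinc_allow() above, using hypothetical sources S1..S3:

State              Msg type         New state              Actions
INCLUDE ({S1,S2})  ALLOW ({S2,S3})  INCLUDE ({S1,S2,S3})   S2,S3 timers = GMI

that is, unknown reported sources are appended to src_list and every reported source has its timer refreshed to now + br_multicast_gmi(br), which is exactly what the loop above does.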
+/* State Msg type New state Actions
+ * INCLUDE (A) IS_EX (B) EXCLUDE (A*B,B-A) (B-A)=0
+ * Delete (A-B)
+ * Group Timer=GMI
+ */
+static void __grp_src_isexc_incl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent)
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ else
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ br_multicast_fwd_src_handle(ent);
+ srcs += src_size;
+ }
+
+ __grp_src_delete_marked(pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) IS_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=GMI
+ * Delete (X-A)
+ * Delete (Y-A)
+ * Group Timer=GMI
+ */
+static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ __grp_src_mod_timer(ent,
+ now + br_multicast_gmi(br));
+ changed = true;
+ }
+ }
+ srcs += src_size;
+ }
+
+ if (__grp_src_delete_marked(pg))
+ changed = true;
+
+ return changed;
+}
+
+static bool br_multicast_isexc(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->key.port->br;
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_isexc_incl(pg, srcs, nsrcs, src_size);
+ br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE);
+ changed = true;
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_isexc_excl(pg, srcs, nsrcs, src_size);
+ break;
+ }
+
+ pg->filter_mode = MCAST_EXCLUDE;
+ mod_timer(&pg->timer, jiffies + br_multicast_gmi(br));
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI
+ * Send Q(G,A-B)
+ */
+static bool __grp_src_toin_incl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->key.port->br;
+ u32 src_idx, to_send = pg->src_ents;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ to_send--;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+ if (ent)
+ __grp_src_mod_timer(ent, now + br_multicast_gmi(br));
+ srcs += src_size;
+ }
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) TO_IN (A) EXCLUDE (X+A,Y-A) (A)=GMI
+ * Send Q(G,X-A)
+ * Send Q(G)
+ */
+static bool __grp_src_toin_excl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->key.port->br;
+ u32 src_idx, to_send = pg->src_ents;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (timer_pending(&ent->timer))
+ ent->flags |= BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ if (timer_pending(&ent->timer)) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ to_send--;
+ }
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+ if (ent)
+ __grp_src_mod_timer(ent, now + br_multicast_gmi(br));
+ srcs += src_size;
+ }
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ __grp_send_query_and_rexmit(pg);
+
+ return changed;
+}
+
+static bool br_multicast_toin(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ changed = __grp_src_toin_incl(pg, srcs, nsrcs, src_size);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_toin_excl(pg, srcs, nsrcs, src_size);
+ break;
+ }
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) TO_EX (B) EXCLUDE (A*B,B-A) (B-A)=0
+ * Delete (A-B)
+ * Send Q(G,A*B)
+ * Group Timer=GMI
+ */
+static void __grp_src_toex_incl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) |
+ BR_SGRP_F_SEND;
+ to_send++;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ }
+ if (ent)
+ br_multicast_fwd_src_handle(ent);
+ srcs += src_size;
+ }
+
+ __grp_src_delete_marked(pg);
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) TO_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=Group Timer
+ * Delete (X-A)
+ * Delete (Y-A)
+ * Send Q(G,A-Y)
+ * Group Timer=GMI
+ */
+static bool __grp_src_toex_excl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ __grp_src_mod_timer(ent, pg->timer.expires);
+ changed = true;
+ }
+ }
+ if (ent && timer_pending(&ent->timer)) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ srcs += src_size;
+ }
+
+ if (__grp_src_delete_marked(pg))
+ changed = true;
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ return changed;
+}
+
+static bool br_multicast_toex(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->key.port->br;
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_toex_incl(pg, srcs, nsrcs, src_size);
+ br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE);
+ changed = true;
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_toex_excl(pg, srcs, nsrcs, src_size);
+ break;
+ }
+
+ pg->filter_mode = MCAST_EXCLUDE;
+ mod_timer(&pg->timer, jiffies + br_multicast_gmi(br));
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B)
+ */
+static void __grp_src_block_incl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ srcs += src_size;
+ }
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ if (pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list))
+ br_multicast_find_del_pg(pg->key.port->br, pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer
+ * Send Q(G,A-Y)
+ */
+static bool __grp_src_block_excl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ __grp_src_mod_timer(ent, pg->timer.expires);
+ changed = true;
+ }
+ }
+ if (ent && timer_pending(&ent->timer)) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ srcs += src_size;
+ }
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ return changed;
+}
+
+static bool br_multicast_block(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_block_incl(pg, srcs, nsrcs, src_size);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_block_excl(pg, srcs, nsrcs, src_size);
+ break;
+ }
+
+ return changed;
+}
+
+static struct net_bridge_port_group *
+br_multicast_find_port(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port *p,
+ const unsigned char *src)
+{
+ struct net_bridge *br __maybe_unused = mp->br;
+ struct net_bridge_port_group *pg;
+
+ for (pg = mlock_dereference(mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br))
+ if (br_port_group_equal(pg, p, src))
+ return pg;
+
+ return NULL;
+}
+
static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
struct net_bridge_port *port,
struct sk_buff *skb,
u16 vid)
{
+ bool igmpv2 = br->multicast_igmp_version == 2;
+ struct net_bridge_mdb_entry *mdst;
+ struct net_bridge_port_group *pg;
const unsigned char *src;
struct igmpv3_report *ih;
struct igmpv3_grec *grec;
- int i;
- int len;
- int num;
- int type;
- int err = 0;
+ int i, len, num, type;
+ bool changed = false;
__be32 group;
+ int err = 0;
u16 nsrcs;
ih = igmpv3_report_hdr(skb);
@@ -947,7 +2280,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
if (!ip_mc_may_pull(skb, len))
return -EINVAL;
- /* We treat this as an IGMPv2 report for now. */
switch (type) {
case IGMPV3_MODE_IS_INCLUDE:
case IGMPV3_MODE_IS_EXCLUDE:
@@ -962,16 +2294,62 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
}
src = eth_hdr(skb)->h_source;
- if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
- type == IGMPV3_MODE_IS_INCLUDE) &&
- nsrcs == 0) {
- br_ip4_multicast_leave_group(br, port, group, vid, src);
+ if (nsrcs == 0 &&
+ (type == IGMPV3_CHANGE_TO_INCLUDE ||
+ type == IGMPV3_MODE_IS_INCLUDE)) {
+ if (!port || igmpv2) {
+ br_ip4_multicast_leave_group(br, port, group, vid, src);
+ continue;
+ }
} else {
err = br_ip4_multicast_add_group(br, port, group, vid,
- src);
+ src, igmpv2);
if (err)
break;
}
+
+ if (!port || igmpv2)
+ continue;
+
+ spin_lock_bh(&br->multicast_lock);
+ mdst = br_mdb_ip4_get(br, group, vid);
+ if (!mdst)
+ goto unlock_continue;
+ pg = br_multicast_find_port(mdst, port, src);
+ if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT))
+ goto unlock_continue;
+ /* reload grec */
+ grec = (void *)(skb->data + len - sizeof(*grec) - (nsrcs * 4));
+ switch (type) {
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ changed = br_multicast_isinc_allow(pg, grec->grec_src,
+ nsrcs, sizeof(__be32));
+ break;
+ case IGMPV3_MODE_IS_INCLUDE:
+ changed = br_multicast_isinc_allow(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_MODE_IS_EXCLUDE:
+ changed = br_multicast_isexc(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ changed = br_multicast_toin(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ changed = br_multicast_toex(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ changed = br_multicast_block(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ }
+ if (changed)
+ br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB);
+unlock_continue:
+ spin_unlock_bh(&br->multicast_lock);
}
return err;
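The "reload grec" step above re-derives the record pointer from skb->data (at offset len - sizeof(*grec) - nsrcs * 4, i.e. the start of the current group record) instead of reusing the pointer taken earlier in the loop, most likely because the intervening ip_mc_may_pull() call for the source list goes through pskb_may_pull(), which may reallocate the skb header and leave earlier pointers into the packet data stale.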
@@ -983,14 +2361,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
struct sk_buff *skb,
u16 vid)
{
+ bool mldv1 = br->multicast_mld_version == 1;
+ struct net_bridge_mdb_entry *mdst;
+ struct net_bridge_port_group *pg;
unsigned int nsrcs_offset;
const unsigned char *src;
struct icmp6hdr *icmp6h;
struct mld2_grec *grec;
unsigned int grec_len;
- int i;
- int len;
- int num;
+ bool changed = false;
+ int i, len, num;
int err = 0;
if (!ipv6_mc_may_pull(skb, sizeof(*icmp6h)))
@@ -1024,7 +2404,6 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
grec = (struct mld2_grec *)(skb->data + len);
len += grec_len;
- /* We treat these as MLDv1 reports for now. */
switch (grec->grec_type) {
case MLD2_MODE_IS_INCLUDE:
case MLD2_MODE_IS_EXCLUDE:
@@ -1042,15 +2421,61 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
nsrcs == 0) {
- br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
- vid, src);
+ if (!port || mldv1) {
+ br_ip6_multicast_leave_group(br, port,
+ &grec->grec_mca,
+ vid, src);
+ continue;
+ }
} else {
err = br_ip6_multicast_add_group(br, port,
&grec->grec_mca, vid,
- src);
+ src, mldv1);
if (err)
break;
}
+
+ if (!port || mldv1)
+ continue;
+
+ spin_lock_bh(&br->multicast_lock);
+ mdst = br_mdb_ip6_get(br, &grec->grec_mca, vid);
+ if (!mdst)
+ goto unlock_continue;
+ pg = br_multicast_find_port(mdst, port, src);
+ if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT))
+ goto unlock_continue;
+ switch (grec->grec_type) {
+ case MLD2_ALLOW_NEW_SOURCES:
+ changed = br_multicast_isinc_allow(pg, grec->grec_src,
+ nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_MODE_IS_INCLUDE:
+ changed = br_multicast_isinc_allow(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_MODE_IS_EXCLUDE:
+ changed = br_multicast_isexc(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_CHANGE_TO_INCLUDE:
+ changed = br_multicast_toin(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_CHANGE_TO_EXCLUDE:
+ changed = br_multicast_toex(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_BLOCK_OLD_SOURCES:
+ changed = br_multicast_block(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ }
+ if (changed)
+ br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB);
+unlock_continue:
+ spin_unlock_bh(&br->multicast_lock);
}
return err;
@@ -1065,16 +2490,16 @@ static bool br_ip4_multicast_select_querier(struct net_bridge *br,
!timer_pending(&br->ip4_other_query.timer))
goto update;
- if (!br->ip4_querier.addr.u.ip4)
+ if (!br->ip4_querier.addr.src.ip4)
goto update;
- if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.u.ip4))
+ if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.src.ip4))
goto update;
return false;
update:
- br->ip4_querier.addr.u.ip4 = saddr;
+ br->ip4_querier.addr.src.ip4 = saddr;
/* update protected by general multicast_lock by caller */
rcu_assign_pointer(br->ip4_querier.port, port);
@@ -1091,13 +2516,13 @@ static bool br_ip6_multicast_select_querier(struct net_bridge *br,
!timer_pending(&br->ip6_other_query.timer))
goto update;
- if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.u.ip6) <= 0)
+ if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.src.ip6) <= 0)
goto update;
return false;
update:
- br->ip6_querier.addr.u.ip6 = *saddr;
+ br->ip6_querier.addr.src.ip6 = *saddr;
/* update protected by general multicast_lock by caller */
rcu_assign_pointer(br->ip6_querier.port, port);
@@ -1112,10 +2537,10 @@ static bool br_multicast_select_querier(struct net_bridge *br,
{
switch (saddr->proto) {
case htons(ETH_P_IP):
- return br_ip4_multicast_select_querier(br, port, saddr->u.ip4);
+ return br_ip4_multicast_select_querier(br, port, saddr->src.ip4);
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- return br_ip6_multicast_select_querier(br, port, &saddr->u.ip6);
+ return br_ip6_multicast_select_querier(br, port, &saddr->src.ip6);
#endif
}
@@ -1245,7 +2670,8 @@ static void br_ip4_multicast_query(struct net_bridge *br,
}
} else if (transport_len >= sizeof(*ih3)) {
ih3 = igmpv3_query_hdr(skb);
- if (ih3->nsrcs)
+ if (ih3->nsrcs ||
+ (br->multicast_igmp_version == 3 && group && ih3->suppress))
goto out;
max_delay = ih3->code ?
@@ -1256,7 +2682,7 @@ static void br_ip4_multicast_query(struct net_bridge *br,
if (!group) {
saddr.proto = htons(ETH_P_IP);
- saddr.u.ip4 = iph->saddr;
+ saddr.src.ip4 = iph->saddr;
br_multicast_query_received(br, port, &br->ip4_other_query,
&saddr, max_delay);
@@ -1280,7 +2706,9 @@ static void br_ip4_multicast_query(struct net_bridge *br,
pp = &p->next) {
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
- try_to_del_timer_sync(&p->timer) >= 0)
+ try_to_del_timer_sync(&p->timer) >= 0 &&
+ (br->multicast_igmp_version == 2 ||
+ p->filter_mode == MCAST_EXCLUDE))
mod_timer(&p->timer, now + max_delay);
}
@@ -1330,6 +2758,10 @@ static int br_ip6_multicast_query(struct net_bridge *br,
mld2q = (struct mld2_query *)icmp6_hdr(skb);
if (!mld2q->mld2q_nsrcs)
group = &mld2q->mld2q_mca;
+ if (br->multicast_mld_version == 2 &&
+ !ipv6_addr_any(&mld2q->mld2q_mca) &&
+ mld2q->mld2q_suppress)
+ goto out;
max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL);
}
@@ -1338,7 +2770,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
if (is_general_query) {
saddr.proto = htons(ETH_P_IPV6);
- saddr.u.ip6 = ipv6_hdr(skb)->saddr;
+ saddr.src.ip6 = ipv6_hdr(skb)->saddr;
br_multicast_query_received(br, port, &br->ip6_other_query,
&saddr, max_delay);
@@ -1363,7 +2795,9 @@ static int br_ip6_multicast_query(struct net_bridge *br,
pp = &p->next) {
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
- try_to_del_timer_sync(&p->timer) >= 0)
+ try_to_del_timer_sync(&p->timer) >= 0 &&
+ (br->multicast_mld_version == 1 ||
+ p->filter_mode == MCAST_EXCLUDE))
mod_timer(&p->timer, now + max_delay);
}
@@ -1407,16 +2841,8 @@ br_multicast_leave_group(struct net_bridge *br,
if (p->flags & MDB_PG_FLAGS_PERMANENT)
break;
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- kfree_rcu(p, rcu);
- br_mdb_notify(br->dev, port, group, RTM_DELMDB,
- p->flags | MDB_PG_FLAGS_FAST_LEAVE);
-
- if (!mp->ports && !mp->host_joined &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
+ p->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_del_pg(mp, p, pp);
}
goto out;
}
@@ -1425,7 +2851,8 @@ br_multicast_leave_group(struct net_bridge *br,
goto out;
if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) {
- __br_multicast_send_query(br, port, &mp->addr);
+ __br_multicast_send_query(br, port, NULL, NULL, &mp->addr,
+ false, 0, NULL);
time = jiffies + br->multicast_last_member_count *
br->multicast_last_member_interval;
@@ -1467,7 +2894,7 @@ br_multicast_leave_group(struct net_bridge *br,
for (p = mlock_dereference(mp->ports, br);
p != NULL;
p = mlock_dereference(p->next, br)) {
- if (p->port != port)
+ if (p->key.port != port)
continue;
if (!hlist_unhashed(&p->mglist) &&
@@ -1498,7 +2925,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
own_query = port ? &port->ip4_own_query : &br->ip4_own_query;
memset(&br_group, 0, sizeof(br_group));
- br_group.u.ip4 = group;
+ br_group.dst.ip4 = group;
br_group.proto = htons(ETH_P_IP);
br_group.vid = vid;
@@ -1522,7 +2949,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
own_query = port ? &port->ip6_own_query : &br->ip6_own_query;
memset(&br_group, 0, sizeof(br_group));
- br_group.u.ip6 = *group;
+ br_group.dst.ip6 = *group;
br_group.proto = htons(ETH_P_IPV6);
br_group.vid = vid;
@@ -1627,7 +3054,8 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
+ err = br_ip4_multicast_add_group(br, port, ih->group, vid, src,
+ true);
break;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
err = br_ip4_multicast_igmp3_report(br, port, skb, vid);
@@ -1706,7 +3134,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
src = eth_hdr(skb)->h_source;
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid,
- src);
+ src, true);
break;
case ICMPV6_MLD2_REPORT:
err = br_ip6_multicast_mld2_report(br, port, skb, vid);
@@ -1781,6 +3209,19 @@ static void br_ip6_multicast_query_expired(struct timer_list *t)
}
#endif
+static void br_multicast_gc_work(struct work_struct *work)
+{
+ struct net_bridge *br = container_of(work, struct net_bridge,
+ mcast_gc_work);
+ HLIST_HEAD(deleted_head);
+
+ spin_lock_bh(&br->multicast_lock);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
+ spin_unlock_bh(&br->multicast_lock);
+
+ br_multicast_gc(&deleted_head);
+}
+
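Multicast objects (mdb entries, port groups, group sources) are no longer freed directly while multicast_lock is held; each embeds a net_bridge_mcast_gc node with a destroy callback, deleted objects are collected on br->mcast_gc_list, and destruction happens later, either from this work item or right after the lock is dropped as in br_multicast_del_port() above. A simplified user-space sketch of that deferred-destruction pattern, not part of the patch and using hypothetical names:

/* Illustrative only -- deferred destruction via an embedded gc node,
 * modelled on net_bridge_mcast_gc.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct mcast_gc {
	struct mcast_gc *next;
	void (*destroy)(struct mcast_gc *gc);
};

struct group_entry {
	int id;
	struct mcast_gc gc;		/* embedded, like net_bridge_mcast_gc */
};

static struct mcast_gc *gc_list;	/* in the kernel: br->mcast_gc_list, under multicast_lock */

static void group_entry_destroy(struct mcast_gc *gc)
{
	/* container_of() done by hand */
	struct group_entry *e = (void *)((char *)gc - offsetof(struct group_entry, gc));

	printf("destroying group entry %d\n", e->id);
	free(e);
}

static void mark_for_gc(struct group_entry *e)
{
	e->gc.destroy = group_entry_destroy;
	e->gc.next = gc_list;
	gc_list = &e->gc;
}

static void run_gc(void)
{
	/* detach the whole list first, as hlist_move_list() does, then walk it */
	struct mcast_gc *gc = gc_list, *next;

	gc_list = NULL;
	for (; gc; gc = next) {
		next = gc->next;
		gc->destroy(gc);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct group_entry *e = malloc(sizeof(*e));

		if (!e)
			return 1;
		e->id = i;
		mark_for_gc(e);
	}
	run_gc();
	return 0;
}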
void br_multicast_init(struct net_bridge *br)
{
br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX;
@@ -1821,6 +3262,8 @@ void br_multicast_init(struct net_bridge *br)
br_ip6_multicast_query_expired, 0);
#endif
INIT_HLIST_HEAD(&br->mdb_list);
+ INIT_HLIST_HEAD(&br->mcast_gc_list);
+ INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work);
}
static void br_ip4_multicast_join_snoopers(struct net_bridge *br)
@@ -1848,7 +3291,7 @@ static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br)
}
#endif
-static void br_multicast_join_snoopers(struct net_bridge *br)
+void br_multicast_join_snoopers(struct net_bridge *br)
{
br_ip4_multicast_join_snoopers(br);
br_ip6_multicast_join_snoopers(br);
@@ -1879,7 +3322,7 @@ static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br)
}
#endif
-static void br_multicast_leave_snoopers(struct net_bridge *br)
+void br_multicast_leave_snoopers(struct net_bridge *br)
{
br_ip4_multicast_leave_snoopers(br);
br_ip6_multicast_leave_snoopers(br);
@@ -1898,9 +3341,6 @@ static void __br_multicast_open(struct net_bridge *br,
void br_multicast_open(struct net_bridge *br)
{
- if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
- br_multicast_join_snoopers(br);
-
__br_multicast_open(br, &br->ip4_own_query);
#if IS_ENABLED(CONFIG_IPV6)
__br_multicast_open(br, &br->ip6_own_query);
@@ -1916,26 +3356,23 @@ void br_multicast_stop(struct net_bridge *br)
del_timer_sync(&br->ip6_other_query.timer);
del_timer_sync(&br->ip6_own_query.timer);
#endif
-
- if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
- br_multicast_leave_snoopers(br);
}
void br_multicast_dev_del(struct net_bridge *br)
{
struct net_bridge_mdb_entry *mp;
+ HLIST_HEAD(deleted_head);
struct hlist_node *tmp;
spin_lock_bh(&br->multicast_lock);
- hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) {
- del_timer(&mp->timer);
- rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
- br_mdb_rht_params);
- hlist_del_rcu(&mp->mdb_node);
- kfree_rcu(mp, rcu);
- }
+ hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node)
+ br_multicast_del_mdb_entry(mp);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
spin_unlock_bh(&br->multicast_lock);
+ br_multicast_gc(&deleted_head);
+ cancel_work_sync(&br->mcast_gc_work);
+
rcu_barrier();
}
@@ -2049,6 +3486,7 @@ static void br_multicast_start_querier(struct net_bridge *br,
int br_multicast_toggle(struct net_bridge *br, unsigned long val)
{
struct net_bridge_port *port;
+ bool change_snoopers = false;
spin_lock_bh(&br->multicast_lock);
if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val)
@@ -2057,7 +3495,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
br_mc_disabled_update(br->dev, val);
br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val);
if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) {
- br_multicast_leave_snoopers(br);
+ change_snoopers = true;
goto unlock;
}
@@ -2068,9 +3506,30 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
list_for_each_entry(port, &br->port_list, list)
__br_multicast_enable_port(port);
+ change_snoopers = true;
+
unlock:
spin_unlock_bh(&br->multicast_lock);
+ /* br_multicast_join_snoopers has the potential to cause
+ * an MLD Report/Leave to be delivered to br_multicast_rcv,
+ * which would in turn call br_multicast_add_group, which would
+ * attempt to acquire multicast_lock. This function should be
+ * called after the lock has been released to avoid deadlocks on
+ * multicast_lock.
+ *
+ * br_multicast_leave_snoopers does not have the problem since
+ * br_multicast_rcv first checks BROPT_MULTICAST_ENABLED, and
+ * returns without calling br_multicast_ipv4/6_rcv if it's not
+ * enabled. Moved both functions out just for symmetry.
+ */
+ if (change_snoopers) {
+ if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ br_multicast_join_snoopers(br);
+ else
+ br_multicast_leave_snoopers(br);
+ }
+
return 0;
}
@@ -2211,7 +3670,7 @@ int br_multicast_list_adjacent(struct net_device *dev,
if (!entry)
goto unlock;
- entry->addr = group->addr;
+ entry->addr = group->key.addr;
list_add(&entry->list, br_ip_list);
count++;
}
@@ -2252,7 +3711,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto)
memset(&eth, 0, sizeof(eth));
eth.h_proto = htons(proto);
- ret = br_multicast_querier_exists(br, &eth);
+ ret = br_multicast_querier_exists(br, &eth, NULL);
unlock:
rcu_read_unlock();
@@ -2468,10 +3927,23 @@ void br_multicast_get_stats(const struct net_bridge *br,
int br_mdb_hash_init(struct net_bridge *br)
{
- return rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params);
+ int err;
+
+ err = rhashtable_init(&br->sg_port_tbl, &br_sg_port_rht_params);
+ if (err)
+ return err;
+
+ err = rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params);
+ if (err) {
+ rhashtable_destroy(&br->sg_port_tbl);
+ return err;
+ }
+
+ return 0;
}
void br_mdb_hash_fini(struct net_bridge *br)
{
+ rhashtable_destroy(&br->sg_port_tbl);
rhashtable_destroy(&br->mdb_hash_tbl);
}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 04c3f9a82650..8edfb98ae1d5 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -735,6 +735,11 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
mtu_reserved = nf_bridge_mtu_reduction(skb);
mtu = skb->dev->mtu;
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+
if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu)
mtu = nf_bridge->frag_max_size;
@@ -835,8 +840,6 @@ static unsigned int br_nf_post_routing(void *priv,
else
return NF_ACCEPT;
- /* We assume any code from br_dev_queue_push_xmit onwards doesn't care
- * about the value of skb->pkt_type. */
if (skb->pkt_type == PACKET_OTHERHOST) {
skb->pkt_type = PACKET_HOST;
nf_bridge->pkt_otherhost = true;
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index da310f0ca725..49700ce0e919 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -16,6 +16,7 @@
#include "br_private.h"
#include "br_private_stp.h"
+#include "br_private_cfm.h"
#include "br_private_tunnel.h"
static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
@@ -93,9 +94,11 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
{
struct net_bridge_vlan_group *vg = NULL;
struct net_bridge_port *p = NULL;
- struct net_bridge *br;
- int num_vlan_infos;
+ struct net_bridge *br = NULL;
+ u32 num_cfm_peer_mep_infos;
+ u32 num_cfm_mep_infos;
size_t vinfo_sz = 0;
+ int num_vlan_infos;
rcu_read_lock();
if (netif_is_bridge_port(dev)) {
@@ -114,6 +117,49 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
/* Each VLAN is returned in bridge_vlan_info along with flags */
vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+ if (!(filter_mask & RTEXT_FILTER_CFM_STATUS))
+ return vinfo_sz;
+
+ if (!br)
+ return vinfo_sz;
+
+ /* CFM status info must be added */
+ br_cfm_mep_count(br, &num_cfm_mep_infos);
+ br_cfm_peer_mep_count(br, &num_cfm_peer_mep_infos);
+
+ vinfo_sz += nla_total_size(0); /* IFLA_BRIDGE_CFM */
+ /* For each status struct the MEP instance (u32) is added */
+ /* MEP instance (u32) + br_cfm_mep_status */
+ vinfo_sz += num_cfm_mep_infos *
+ /*IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE */
+ (nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN */
+ + nla_total_size(sizeof(u32)));
+ /* MEP instance (u32) + br_cfm_cc_peer_status */
+ vinfo_sz += num_cfm_peer_mep_infos *
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE */
+ (nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE */
+ + nla_total_size(sizeof(u8))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE */
+ + nla_total_size(sizeof(u8))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN */
+ + nla_total_size(sizeof(u32)));
+
return vinfo_sz;
}
@@ -377,7 +423,8 @@ nla_put_failure:
static int br_fill_ifinfo(struct sk_buff *skb,
const struct net_bridge_port *port,
u32 pid, u32 seq, int event, unsigned int flags,
- u32 filter_mask, const struct net_device *dev)
+ u32 filter_mask, const struct net_device *dev,
+ bool getlink)
{
u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
struct nlattr *af = NULL;
@@ -426,7 +473,9 @@ static int br_fill_ifinfo(struct sk_buff *skb,
if (filter_mask & (RTEXT_FILTER_BRVLAN |
RTEXT_FILTER_BRVLAN_COMPRESSED |
- RTEXT_FILTER_MRP)) {
+ RTEXT_FILTER_MRP |
+ RTEXT_FILTER_CFM_CONFIG |
+ RTEXT_FILTER_CFM_STATUS)) {
af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
if (!af)
goto nla_put_failure;
@@ -475,6 +524,36 @@ static int br_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
}
+ if (filter_mask & (RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS)) {
+ struct nlattr *cfm_nest = NULL;
+ int err;
+
+ if (!br_cfm_created(br) || port)
+ goto done;
+
+ cfm_nest = nla_nest_start(skb, IFLA_BRIDGE_CFM);
+ if (!cfm_nest)
+ goto nla_put_failure;
+
+ if (filter_mask & RTEXT_FILTER_CFM_CONFIG) {
+ rcu_read_lock();
+ err = br_cfm_config_fill_info(skb, br);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (filter_mask & RTEXT_FILTER_CFM_STATUS) {
+ rcu_read_lock();
+ err = br_cfm_status_fill_info(skb, br, getlink);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, cfm_nest);
+ }
+
done:
if (af)
nla_nest_end(skb, af);
@@ -486,11 +565,9 @@ nla_put_failure:
return -EMSGSIZE;
}
-/* Notify listeners of a change in bridge or port information */
-void br_ifinfo_notify(int event, const struct net_bridge *br,
- const struct net_bridge_port *port)
+void br_info_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port, u32 filter)
{
- u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED;
struct net_device *dev;
struct sk_buff *skb;
int err = -ENOBUFS;
@@ -515,7 +592,7 @@ void br_ifinfo_notify(int event, const struct net_bridge *br,
if (skb == NULL)
goto errout;
- err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev);
+ err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev, false);
if (err < 0) {
/* -EMSGSIZE implies BUG in br_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -528,6 +605,15 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
}
+/* Notify listeners of a change in bridge or port information */
+void br_ifinfo_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port)
+{
+ u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED;
+
+ return br_info_notify(event, br, port, filter);
+}
+
/*
* Dump information about all ports, in response to GETLINK
*/
@@ -538,11 +624,13 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) &&
!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) &&
- !(filter_mask & RTEXT_FILTER_MRP))
+ !(filter_mask & RTEXT_FILTER_MRP) &&
+ !(filter_mask & RTEXT_FILTER_CFM_CONFIG) &&
+ !(filter_mask & RTEXT_FILTER_CFM_STATUS))
return 0;
return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags,
- filter_mask, dev);
+ filter_mask, dev, true);
}
static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
@@ -700,6 +788,11 @@ static int br_afspec(struct net_bridge *br,
if (err)
return err;
break;
+ case IFLA_BRIDGE_CFM:
+ err = br_cfm_parse(br, p, attr, cmd, extack);
+ if (err)
+ return err;
+ break;
}
}
@@ -1091,8 +1184,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
[IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
[IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
- [IFLA_BR_MULTI_BOOLOPT] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct br_boolopt_multi) },
+ [IFLA_BR_MULTI_BOOLOPT] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)),
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1631,7 +1724,7 @@ static int br_fill_linkxstats(struct sk_buff *skb,
pvid = br_get_pvid(vg);
list_for_each_entry(v, &vg->vlan_list, vlist) {
struct bridge_vlan_xstats vxi;
- struct br_vlan_stats stats;
+ struct pcpu_sw_netstats stats;
if (++vl_idx < *prividx)
continue;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index baa1500f384f..d62c6e1af64a 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -89,14 +89,6 @@ struct bridge_mcast_stats {
};
#endif
-struct br_vlan_stats {
- u64 rx_bytes;
- u64 rx_packets;
- u64 tx_bytes;
- u64 tx_packets;
- struct u64_stats_sync syncp;
-};
-
struct br_tunnel_info {
__be64 tunnel_id;
struct metadata_dst *tunnel_dst;
@@ -137,7 +129,7 @@ struct net_bridge_vlan {
u16 flags;
u16 priv_flags;
u8 state;
- struct br_vlan_stats __percpu *stats;
+ struct pcpu_sw_netstats __percpu *stats;
union {
struct net_bridge *br;
struct net_bridge_port *port;
@@ -213,27 +205,71 @@ struct net_bridge_fdb_entry {
#define MDB_PG_FLAGS_PERMANENT BIT(0)
#define MDB_PG_FLAGS_OFFLOAD BIT(1)
#define MDB_PG_FLAGS_FAST_LEAVE BIT(2)
+#define MDB_PG_FLAGS_STAR_EXCL BIT(3)
+#define MDB_PG_FLAGS_BLOCKED BIT(4)
-struct net_bridge_port_group {
- struct net_bridge_port *port;
- struct net_bridge_port_group __rcu *next;
- struct hlist_node mglist;
- struct rcu_head rcu;
+#define PG_SRC_ENT_LIMIT 32
+
+#define BR_SGRP_F_DELETE BIT(0)
+#define BR_SGRP_F_SEND BIT(1)
+#define BR_SGRP_F_INSTALLED BIT(2)
+
+struct net_bridge_mcast_gc {
+ struct hlist_node gc_node;
+ void (*destroy)(struct net_bridge_mcast_gc *gc);
+};
+
+struct net_bridge_group_src {
+ struct hlist_node node;
+
+ struct br_ip addr;
+ struct net_bridge_port_group *pg;
+ u8 flags;
+ u8 src_query_rexmit_cnt;
struct timer_list timer;
+
+ struct net_bridge *br;
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
+};
+
+struct net_bridge_port_group_sg_key {
+ struct net_bridge_port *port;
struct br_ip addr;
+};
+
+struct net_bridge_port_group {
+ struct net_bridge_port_group __rcu *next;
+ struct net_bridge_port_group_sg_key key;
unsigned char eth_addr[ETH_ALEN] __aligned(2);
unsigned char flags;
+ unsigned char filter_mode;
+ unsigned char grp_query_rexmit_cnt;
+ unsigned char rt_protocol;
+
+ struct hlist_head src_list;
+ unsigned int src_ents;
+ struct timer_list timer;
+ struct timer_list rexmit_timer;
+ struct hlist_node mglist;
+
+ struct rhash_head rhnode;
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
};
struct net_bridge_mdb_entry {
struct rhash_head rhnode;
struct net_bridge *br;
struct net_bridge_port_group __rcu *ports;
- struct rcu_head rcu;
- struct timer_list timer;
struct br_ip addr;
bool host_joined;
+
+ struct timer_list timer;
struct hlist_node mdb_node;
+
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
};
struct net_bridge_port {
@@ -339,9 +375,8 @@ enum net_bridge_opts {
struct net_bridge {
spinlock_t lock;
spinlock_t hash_lock;
- struct list_head port_list;
+ struct hlist_head frame_type_list;
struct net_device *dev;
- struct pcpu_sw_netstats __percpu *stats;
unsigned long options;
/* These fields are accessed on each packet */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
@@ -351,6 +386,7 @@ struct net_bridge {
#endif
struct rhashtable fdb_hash_tbl;
+ struct list_head port_list;
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
union {
struct rtable fake_rtable;
@@ -405,7 +441,9 @@ struct net_bridge {
unsigned long multicast_startup_query_interval;
struct rhashtable mdb_hash_tbl;
+ struct rhashtable sg_port_tbl;
+ struct hlist_head mcast_gc_list;
struct hlist_head mdb_list;
struct hlist_head router_list;
@@ -419,6 +457,7 @@ struct net_bridge {
struct bridge_mcast_own_query ip6_own_query;
struct bridge_mcast_querier ip6_querier;
#endif /* IS_ENABLED(CONFIG_IPV6) */
+ struct work_struct mcast_gc_work;
#endif
struct timer_list hello_timer;
@@ -434,7 +473,10 @@ struct net_bridge {
struct hlist_head fdb_list;
#if IS_ENABLED(CONFIG_BRIDGE_MRP)
- struct list_head mrp_list;
+ struct hlist_head mrp_list;
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+ struct hlist_head mep_list;
#endif
};
@@ -708,6 +750,16 @@ int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev);
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
rx_handler_func_t *br_get_rx_handler(const struct net_device *dev);
+struct br_frame_type {
+ __be16 type;
+ int (*frame_handler)(struct net_bridge_port *port,
+ struct sk_buff *skb);
+ struct hlist_node list;
+};
+
+void br_add_frame(struct net_bridge *br, struct br_frame_type *ft);
+void br_del_frame(struct net_bridge *br, struct br_frame_type *ft);
+
static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
{
return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev);
@@ -745,6 +797,8 @@ void br_multicast_del_port(struct net_bridge_port *port);
void br_multicast_enable_port(struct net_bridge_port *port);
void br_multicast_disable_port(struct net_bridge_port *port);
void br_multicast_init(struct net_bridge *br);
+void br_multicast_join_snoopers(struct net_bridge *br);
+void br_multicast_leave_snoopers(struct net_bridge *br);
void br_multicast_open(struct net_bridge *br);
void br_multicast_stop(struct net_bridge *br);
void br_multicast_dev_del(struct net_bridge *br);
@@ -766,13 +820,17 @@ br_multicast_new_group(struct net_bridge *br, struct br_ip *group);
struct net_bridge_port_group *
br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
struct net_bridge_port_group __rcu *next,
- unsigned char flags, const unsigned char *src);
+ unsigned char flags, const unsigned char *src,
+ u8 filter_mode, u8 rt_protocol);
int br_mdb_hash_init(struct net_bridge *br);
void br_mdb_hash_fini(struct net_bridge *br);
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type, u8 flags);
+void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg, int type);
void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
int type);
+void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_port_group __rcu **pp);
void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
const struct sk_buff *skb, u8 type, u8 dir);
int br_multicast_init_stats(struct net_bridge *br);
@@ -784,6 +842,15 @@ void br_mdb_init(void);
void br_mdb_uninit(void);
void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify);
void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify);
+void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg,
+ u8 filter_mode);
+void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp,
+ struct net_bridge_port_group *sg);
+
+static inline bool br_group_is_l2(const struct br_ip *group)
+{
+ return group->proto == 0;
+}
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
@@ -816,7 +883,8 @@ __br_multicast_querier_exists(struct net_bridge *br,
}
static inline bool br_multicast_querier_exists(struct net_bridge *br,
- struct ethhdr *eth)
+ struct ethhdr *eth,
+ const struct net_bridge_mdb_entry *mdb)
{
switch (eth->h_proto) {
case (htons(ETH_P_IP)):
@@ -828,6 +896,35 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br,
&br->ip6_other_query, true);
#endif
default:
+ return !!mdb && br_group_is_l2(&mdb->addr);
+ }
+}
+
+static inline bool br_multicast_is_star_g(const struct br_ip *ip)
+{
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ return ipv4_is_zeronet(ip->src.ip4);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return ipv6_addr_any(&ip->src.ip6);
+#endif
+ default:
+ return false;
+ }
+}
+
+static inline bool br_multicast_should_handle_mode(const struct net_bridge *br,
+ __be16 proto)
+{
+ switch (proto) {
+ case htons(ETH_P_IP):
+ return !!(br->multicast_igmp_version == 3);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return !!(br->multicast_mld_version == 2);
+#endif
+ default:
return false;
}
}
@@ -836,6 +933,19 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb)
{
return BR_INPUT_SKB_CB(skb)->igmp;
}
+
+static inline unsigned long br_multicast_lmqt(const struct net_bridge *br)
+{
+ return br->multicast_last_member_interval *
+ br->multicast_last_member_count;
+}
+
+static inline unsigned long br_multicast_gmi(const struct net_bridge *br)
+{
+ /* use the RFC default of 2 for QRV */
+ return 2 * br->multicast_query_interval +
+ br->multicast_query_response_interval;
+}
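(Worked example, assuming the bridge's default timer values: with a 125 s query interval, a 10 s query response interval and the RFC 3376 robustness default of 2, br_multicast_gmi() evaluates to 2 * 125 s + 10 s = 260 s, while br_multicast_lmqt() with the default 1 s last member interval and a count of 2 evaluates to 2 s. Both intervals are stored and returned in jiffies.)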
#else
static inline int br_multicast_rcv(struct net_bridge *br,
struct net_bridge_port *port,
@@ -872,6 +982,14 @@ static inline void br_multicast_init(struct net_bridge *br)
{
}
+static inline void br_multicast_join_snoopers(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_leave_snoopers(struct net_bridge *br)
+{
+}
+
static inline void br_multicast_open(struct net_bridge *br)
{
}
@@ -896,7 +1014,8 @@ static inline bool br_multicast_is_router(struct net_bridge *br)
}
static inline bool br_multicast_querier_exists(struct net_bridge *br,
- struct ethhdr *eth)
+ struct ethhdr *eth,
+ const struct net_bridge_mdb_entry *mdb)
{
return false;
}
@@ -975,7 +1094,7 @@ void nbp_vlan_flush(struct net_bridge_port *port);
int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack);
int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask);
void br_vlan_get_stats(const struct net_bridge_vlan *v,
- struct br_vlan_stats *stats);
+ struct pcpu_sw_netstats *stats);
void br_vlan_port_event(struct net_bridge_port *p, unsigned long event);
int br_vlan_bridge_event(struct net_device *dev, unsigned long event,
void *ptr);
@@ -1171,7 +1290,7 @@ static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu(
}
static inline void br_vlan_get_stats(const struct net_bridge_vlan *v,
- struct br_vlan_stats *stats)
+ struct pcpu_sw_netstats *stats)
{
}
@@ -1320,7 +1439,6 @@ extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr)
#if IS_ENABLED(CONFIG_BRIDGE_MRP)
int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
struct nlattr *attr, int cmd, struct netlink_ext_ack *extack);
-int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb);
bool br_mrp_enabled(struct net_bridge *br);
void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p);
int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br);
@@ -1332,11 +1450,6 @@ static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
return -EOPNOTSUPP;
}
-static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb)
-{
- return 0;
-}
-
static inline bool br_mrp_enabled(struct net_bridge *br)
{
return false;
@@ -1354,12 +1467,67 @@ static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br)
#endif
+/* br_cfm.c */
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd, struct netlink_ext_ack *extack);
+bool br_cfm_created(struct net_bridge *br);
+void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p);
+int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br);
+int br_cfm_status_fill_info(struct sk_buff *skb,
+ struct net_bridge *br,
+ bool getlink);
+int br_cfm_mep_count(struct net_bridge *br, u32 *count);
+int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count);
+#else
+static inline int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool br_cfm_created(struct net_bridge *br)
+{
+ return false;
+}
+
+static inline void br_cfm_port_del(struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+}
+
+static inline int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_cfm_status_fill_info(struct sk_buff *skb,
+ struct net_bridge *br,
+ bool getlink)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
/* br_netlink.c */
extern struct rtnl_link_ops br_link_ops;
int br_netlink_init(void);
void br_netlink_fini(void);
void br_ifinfo_notify(int event, const struct net_bridge *br,
const struct net_bridge_port *port);
+void br_info_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port, u32 filter);
int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags,
struct netlink_ext_ack *extack);
int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
diff --git a/net/bridge/br_private_cfm.h b/net/bridge/br_private_cfm.h
new file mode 100644
index 000000000000..a43a5e7fa2c3
--- /dev/null
+++ b/net/bridge/br_private_cfm.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _BR_PRIVATE_CFM_H_
+#define _BR_PRIVATE_CFM_H_
+
+#include "br_private.h"
+#include <uapi/linux/cfm_bridge.h>
+
+struct br_cfm_mep_create {
+ enum br_cfm_domain domain; /* Domain for this MEP */
+ enum br_cfm_mep_direction direction; /* Up or Down MEP direction */
+ u32 ifindex; /* Residence port */
+};
+
+int br_cfm_mep_create(struct net_bridge *br,
+ const u32 instance,
+ struct br_cfm_mep_create *const create,
+ struct netlink_ext_ack *extack);
+
+int br_cfm_mep_delete(struct net_bridge *br,
+ const u32 instance,
+ struct netlink_ext_ack *extack);
+
+struct br_cfm_mep_config {
+ u32 mdlevel;
+ u32 mepid; /* MEPID for this MEP */
+ struct mac_addr unicast_mac; /* The MEP unicast MAC */
+};
+
+int br_cfm_mep_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_mep_config *const config,
+ struct netlink_ext_ack *extack);
+
+struct br_cfm_maid {
+ u8 data[CFM_MAID_LENGTH];
+};
+
+struct br_cfm_cc_config {
+ /* Expected received CCM PDU MAID. */
+ struct br_cfm_maid exp_maid;
+
+ /* Expected received CCM PDU interval. */
+ /* Transmitting CCM PDU interval when CCM tx is enabled. */
+ enum br_cfm_ccm_interval exp_interval;
+
+ bool enable; /* Enable/disable CCM PDU handling */
+};
+
+int br_cfm_cc_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_cc_config *const config,
+ struct netlink_ext_ack *extack);
+
+int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance,
+ u32 peer_mep_id,
+ struct netlink_ext_ack *extack);
+int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance,
+ u32 peer_mep_id,
+ struct netlink_ext_ack *extack);
+
+/* Transmitted CCM Remote Defect Indication status set.
+ * This RDI is inserted in transmitted CCM PDUs if CCM transmission is enabled.
+ * See br_cfm_cc_ccm_tx() with interval != BR_CFM_CCM_INTERVAL_NONE
+ */
+int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance,
+ const bool rdi, struct netlink_ext_ack *extack);
+
+/* OAM PDU Tx information */
+struct br_cfm_cc_ccm_tx_info {
+ struct mac_addr dmac;
+ /* The CCM will be transmitted for this period in seconds.
+ * Call br_cfm_cc_ccm_tx before timeout to keep transmission alive.
+ * When period is zero any ongoing transmission will be stopped.
+ */
+ u32 period;
+
+ bool seq_no_update; /* Update Tx CCM sequence number */
+ bool if_tlv; /* Insert Interface Status TLV */
+ u8 if_tlv_value; /* Interface Status TLV value */
+ bool port_tlv; /* Insert Port Status TLV */
+ u8 port_tlv_value; /* Port Status TLV value */
+ /* Sender ID TLV ??
+ * Organization-Specific TLV ??
+ */
+};
+
+int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance,
+ const struct br_cfm_cc_ccm_tx_info *const tx_info,
+ struct netlink_ext_ack *extack);
+
+struct br_cfm_mep_status {
+ /* Indications that an OAM PDU has been seen. */
+ bool opcode_unexp_seen; /* RX of OAM PDU with unexpected opcode */
+ bool version_unexp_seen; /* RX of OAM PDU with unexpected version */
+ bool rx_level_low_seen; /* Rx of OAM PDU with level low */
+};
+
+struct br_cfm_cc_peer_status {
+ /* This CCM related status is based on the latest received CCM PDU. */
+ u8 port_tlv_value; /* Port Status TLV value */
+ u8 if_tlv_value; /* Interface Status TLV value */
+
+ /* CCM has not been received for 3.25 intervals */
+ u8 ccm_defect:1;
+
+ /* (RDI == 1) for last received CCM PDU */
+ u8 rdi:1;
+
+ /* Indications that a CCM PDU has been seen. */
+ u8 seen:1; /* CCM PDU received */
+ u8 tlv_seen:1; /* CCM PDU with TLV received */
+ /* CCM PDU with unexpected sequence number received */
+ u8 seq_unexp_seen:1;
+};
+
+struct br_cfm_mep {
+ /* list header of MEP instances */
+ struct hlist_node head;
+ u32 instance;
+ struct br_cfm_mep_create create;
+ struct br_cfm_mep_config config;
+ struct br_cfm_cc_config cc_config;
+ struct br_cfm_cc_ccm_tx_info cc_ccm_tx_info;
+ /* List of multiple peer MEPs */
+ struct hlist_head peer_mep_list;
+ struct net_bridge_port __rcu *b_port;
+ unsigned long ccm_tx_end;
+ struct delayed_work ccm_tx_dwork;
+ u32 ccm_tx_snumber;
+ u32 ccm_rx_snumber;
+ struct br_cfm_mep_status status;
+ bool rdi;
+ struct rcu_head rcu;
+};
+
+struct br_cfm_peer_mep {
+ struct hlist_node head;
+ struct br_cfm_mep *mep;
+ struct delayed_work ccm_rx_dwork;
+ u32 mepid;
+ struct br_cfm_cc_peer_status cc_status;
+ u32 ccm_rx_count_miss;
+ struct rcu_head rcu;
+};
+
+#endif /* _BR_PRIVATE_CFM_H_ */
diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h
index af0e9eff6549..2514954c1431 100644
--- a/net/bridge/br_private_mrp.h
+++ b/net/bridge/br_private_mrp.h
@@ -8,7 +8,7 @@
struct br_mrp {
/* list of mrp instances */
- struct list_head list;
+ struct hlist_node list;
struct net_bridge_port __rcu *p_port;
struct net_bridge_port __rcu *s_port;
@@ -72,8 +72,7 @@ int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp,
int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp,
u32 interval, u8 max_miss, u32 period,
bool monitor);
-int br_mrp_port_switchdev_set_state(struct net_bridge_port *p,
- enum br_mrp_port_state_type state);
+int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state);
int br_mrp_port_switchdev_set_role(struct net_bridge_port *p,
enum br_mrp_port_role_type role);
int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp,
@@ -88,4 +87,33 @@ int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp,
int br_mrp_ring_port_open(struct net_device *dev, u8 loc);
int br_mrp_in_port_open(struct net_device *dev, u8 loc);
+/* MRP protocol data units */
+struct br_mrp_tlv_hdr {
+ __u8 type;
+ __u8 length;
+};
+
+struct br_mrp_common_hdr {
+ __be16 seq_id;
+ __u8 domain[MRP_DOMAIN_UUID_LENGTH];
+};
+
+struct br_mrp_ring_test_hdr {
+ __be16 prio;
+ __u8 sa[ETH_ALEN];
+ __be16 port_role;
+ __be16 state;
+ __be16 transitions;
+ __be32 timestamp;
+} __attribute__((__packed__));
+
+struct br_mrp_in_test_hdr {
+ __be16 id;
+ __u8 sa[ETH_ALEN];
+ __be16 port_role;
+ __be16 state;
+ __be16 transitions;
+ __be32 timestamp;
+} __attribute__((__packed__));
+
#endif /* _BR_PRIVATE_MRP_H */
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index ee8780080be5..701cad646b20 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -140,7 +140,7 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
return err == -EOPNOTSUPP ? 0 : err;
}
-/* Returns a master vlan, if it didn't exist it gets created. In all cases a
+/* Returns a master vlan, if it didn't exist it gets created. In all cases
* a reference is taken to the master vlan before returning.
*/
static struct net_bridge_vlan *
@@ -266,11 +266,14 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
}
masterv = br_vlan_get_master(br, v->vid, extack);
- if (!masterv)
+ if (!masterv) {
+ err = -ENOMEM;
goto out_filt;
+ }
v->brvlan = masterv;
if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) {
- v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats);
+ v->stats =
+ netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!v->stats) {
err = -ENOMEM;
goto out_filt;
@@ -421,7 +424,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb)
{
- struct br_vlan_stats *stats;
+ struct pcpu_sw_netstats *stats;
struct net_bridge_vlan *v;
u16 vid;
@@ -474,7 +477,7 @@ static bool __allowed_ingress(const struct net_bridge *br,
struct sk_buff *skb, u16 *vid,
u8 *state)
{
- struct br_vlan_stats *stats;
+ struct pcpu_sw_netstats *stats;
struct net_bridge_vlan *v;
bool tagged;
@@ -708,7 +711,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed,
if (!vlan)
return -ENOMEM;
- vlan->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats);
+ vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!vlan->stats) {
kfree(vlan);
return -ENOMEM;
@@ -853,15 +856,25 @@ EXPORT_SYMBOL_GPL(br_vlan_get_proto);
int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
{
+ struct switchdev_attr attr = {
+ .orig_dev = br->dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
+ .u.vlan_protocol = ntohs(proto),
+ };
int err = 0;
struct net_bridge_port *p;
struct net_bridge_vlan *vlan;
struct net_bridge_vlan_group *vg;
- __be16 oldproto;
+ __be16 oldproto = br->vlan_proto;
if (br->vlan_proto == proto)
return 0;
+ err = switchdev_port_attr_set(br->dev, &attr);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
/* Add VLANs for the new proto to the device filter. */
list_for_each_entry(p, &br->port_list, list) {
vg = nbp_vlan_group(p);
@@ -872,7 +885,6 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
}
}
- oldproto = br->vlan_proto;
br->vlan_proto = proto;
recalculate_group_addr(br);
@@ -888,6 +900,9 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
return 0;
err_filt:
+ attr.u.vlan_protocol = ntohs(oldproto);
+ switchdev_port_attr_set(br->dev, &attr);
+
list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist)
vlan_vid_del(p->dev, proto, vlan->vid);
@@ -1262,14 +1277,14 @@ void nbp_vlan_flush(struct net_bridge_port *port)
}
void br_vlan_get_stats(const struct net_bridge_vlan *v,
- struct br_vlan_stats *stats)
+ struct pcpu_sw_netstats *stats)
{
int i;
memset(stats, 0, sizeof(*stats));
for_each_possible_cpu(i) {
u64 rxpackets, rxbytes, txpackets, txbytes;
- struct br_vlan_stats *cpu_stats;
+ struct pcpu_sw_netstats *cpu_stats;
unsigned int start;
cpu_stats = per_cpu_ptr(v->stats, i);
@@ -1585,7 +1600,7 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event)
static bool br_vlan_stats_fill(struct sk_buff *skb,
const struct net_bridge_vlan *v)
{
- struct br_vlan_stats stats;
+ struct pcpu_sw_netstats stats;
struct nlattr *nest;
nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS);
@@ -1897,8 +1912,8 @@ out_err:
}
static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = {
- [BRIDGE_VLANDB_ENTRY_INFO] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct bridge_vlan_info) },
+ [BRIDGE_VLANDB_ENTRY_INFO] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)),
[BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 },
[BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 },
[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED },
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 5040fe43f4b4..ac5372121e60 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -17,7 +17,9 @@ config NFT_BRIDGE_META
config NFT_BRIDGE_REJECT
tristate "Netfilter nf_tables bridge reject support"
- depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6
+ depends on NFT_REJECT
+ depends on NF_REJECT_IPV4
+ depends on NF_REJECT_IPV6
help
Add support to reject packets.
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index 12a4f4d93681..3fda71a8579d 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -21,7 +21,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nat_info *info = par->targinfo;
- if (skb_ensure_writable(skb, ETH_ALEN))
+ if (skb_ensure_writable(skb, 0))
return EBT_DROP;
ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 0cad62a4052b..307790562b49 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -21,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_redirect_info *info = par->targinfo;
- if (skb_ensure_writable(skb, ETH_ALEN))
+ if (skb_ensure_writable(skb, 0))
return EBT_DROP;
if (xt_hooknum(par) != NF_BR_BROUTING)
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 27443bf229a3..7dfbcdfc30e5 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -22,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nat_info *info = par->targinfo;
- if (skb_ensure_writable(skb, ETH_ALEN * 2))
+ if (skb_ensure_writable(skb, 0))
return EBT_DROP;
ether_addr_copy(eth_hdr(skb)->h_source, info->mac);
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 0d6d20c9105e..8f68afda5f81 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -15,7 +15,6 @@
#include <linux/netfilter_bridge/ebt_stp.h>
#define BPDU_TYPE_CONFIG 0
-#define BPDU_TYPE_TCN 0x80
struct stp_header {
u8 dsap;
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index deae2c9a0f69..eba0efe64d05 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -39,30 +39,6 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
}
}
-static int nft_bridge_iphdr_validate(struct sk_buff *skb)
-{
- struct iphdr *iph;
- u32 len;
-
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- return 0;
-
- iph = ip_hdr(skb);
- if (iph->ihl < 5 || iph->version != 4)
- return 0;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len)
- return 0;
- else if (len < (iph->ihl*4))
- return 0;
-
- if (!pskb_may_pull(skb, iph->ihl*4))
- return 0;
-
- return 1;
-}
-
/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT)
* or the bridge port (NF_BRIDGE PREROUTING).
*/
@@ -72,29 +48,11 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net,
int hook)
{
struct sk_buff *nskb;
- struct iphdr *niph;
- const struct tcphdr *oth;
- struct tcphdr _oth;
- if (!nft_bridge_iphdr_validate(oldskb))
- return;
-
- oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
- if (!oth)
- return;
-
- nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
- LL_MAX_HEADER, GFP_ATOMIC);
+ nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook);
if (!nskb)
return;
- skb_reserve(nskb, LL_MAX_HEADER);
- niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
- net->ipv4.sysctl_ip_default_ttl);
- nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
- niph->tot_len = htons(nskb->len);
- ip_send_check(niph);
-
nft_reject_br_push_etherhdr(oldskb, nskb);
br_forward(br_port_get_rcu(dev), nskb, false, true);
@@ -106,139 +64,32 @@ static void nft_reject_br_send_v4_unreach(struct net *net,
int hook, u8 code)
{
struct sk_buff *nskb;
- struct iphdr *niph;
- struct icmphdr *icmph;
- unsigned int len;
- __wsum csum;
- u8 proto;
-
- if (!nft_bridge_iphdr_validate(oldskb))
- return;
-
- /* IP header checks: fragment. */
- if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
- return;
-
- /* RFC says return as much as we can without exceeding 576 bytes. */
- len = min_t(unsigned int, 536, oldskb->len);
-
- if (!pskb_may_pull(oldskb, len))
- return;
-
- if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
- return;
-
- proto = ip_hdr(oldskb)->protocol;
-
- if (!skb_csum_unnecessary(oldskb) &&
- nf_reject_verify_csum(proto) &&
- nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
- return;
- nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) +
- LL_MAX_HEADER + len, GFP_ATOMIC);
+ nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code);
if (!nskb)
return;
- skb_reserve(nskb, LL_MAX_HEADER);
- niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
- net->ipv4.sysctl_ip_default_ttl);
-
- skb_reset_transport_header(nskb);
- icmph = skb_put_zero(nskb, sizeof(struct icmphdr));
- icmph->type = ICMP_DEST_UNREACH;
- icmph->code = code;
-
- skb_put_data(nskb, skb_network_header(oldskb), len);
-
- csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0);
- icmph->checksum = csum_fold(csum);
-
- niph->tot_len = htons(nskb->len);
- ip_send_check(niph);
-
nft_reject_br_push_etherhdr(oldskb, nskb);
br_forward(br_port_get_rcu(dev), nskb, false, true);
}
-static int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
-{
- struct ipv6hdr *hdr;
- u32 pkt_len;
-
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- return 0;
-
- hdr = ipv6_hdr(skb);
- if (hdr->version != 6)
- return 0;
-
- pkt_len = ntohs(hdr->payload_len);
- if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
- return 0;
-
- return 1;
-}
-
static void nft_reject_br_send_v6_tcp_reset(struct net *net,
struct sk_buff *oldskb,
const struct net_device *dev,
int hook)
{
struct sk_buff *nskb;
- const struct tcphdr *oth;
- struct tcphdr _oth;
- unsigned int otcplen;
- struct ipv6hdr *nip6h;
- if (!nft_bridge_ip6hdr_validate(oldskb))
- return;
-
- oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook);
- if (!oth)
- return;
-
- nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) +
- LL_MAX_HEADER, GFP_ATOMIC);
+ nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook);
if (!nskb)
return;
- skb_reserve(nskb, LL_MAX_HEADER);
- nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
- net->ipv6.devconf_all->hop_limit);
- nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen);
- nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
-
nft_reject_br_push_etherhdr(oldskb, nskb);
br_forward(br_port_get_rcu(dev), nskb, false, true);
}
-static bool reject6_br_csum_ok(struct sk_buff *skb, int hook)
-{
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- int thoff;
- __be16 fo;
- u8 proto = ip6h->nexthdr;
-
- if (skb_csum_unnecessary(skb))
- return true;
-
- if (ip6h->payload_len &&
- pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
- return false;
-
- ip6h = ipv6_hdr(skb);
- thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
- if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
- return false;
-
- if (!nf_reject_verify_csum(proto))
- return true;
-
- return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
-}
static void nft_reject_br_send_v6_unreach(struct net *net,
struct sk_buff *oldskb,
@@ -246,49 +97,11 @@ static void nft_reject_br_send_v6_unreach(struct net *net,
int hook, u8 code)
{
struct sk_buff *nskb;
- struct ipv6hdr *nip6h;
- struct icmp6hdr *icmp6h;
- unsigned int len;
-
- if (!nft_bridge_ip6hdr_validate(oldskb))
- return;
- /* Include "As much of invoking packet as possible without the ICMPv6
- * packet exceeding the minimum IPv6 MTU" in the ICMP payload.
- */
- len = min_t(unsigned int, 1220, oldskb->len);
-
- if (!pskb_may_pull(oldskb, len))
- return;
-
- if (!reject6_br_csum_ok(oldskb, hook))
- return;
-
- nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) +
- LL_MAX_HEADER + len, GFP_ATOMIC);
+ nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code);
if (!nskb)
return;
- skb_reserve(nskb, LL_MAX_HEADER);
- nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6,
- net->ipv6.devconf_all->hop_limit);
-
- skb_reset_transport_header(nskb);
- icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr));
- icmp6h->icmp6_type = ICMPV6_DEST_UNREACH;
- icmp6h->icmp6_code = code;
-
- skb_put_data(nskb, skb_network_header(oldskb), len);
- nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
-
- icmp6h->icmp6_cksum =
- csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
- nskb->len - sizeof(struct ipv6hdr),
- IPPROTO_ICMPV6,
- csum_partial(icmp6h,
- nskb->len - sizeof(struct ipv6hdr),
- 0));
-
nft_reject_br_push_etherhdr(oldskb, nskb);
br_forward(br_port_get_rcu(dev), nskb, false, true);
@@ -364,69 +177,13 @@ static int nft_reject_bridge_validate(const struct nft_ctx *ctx,
(1 << NF_BR_LOCAL_IN));
}
-static int nft_reject_bridge_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
-{
- struct nft_reject *priv = nft_expr_priv(expr);
- int icmp_code;
-
- if (tb[NFTA_REJECT_TYPE] == NULL)
- return -EINVAL;
-
- priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
- return -EINVAL;
-
- icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
- if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
- icmp_code > NFT_REJECT_ICMPX_MAX)
- return -EINVAL;
-
- priv->icmp_code = icmp_code;
- break;
- case NFT_REJECT_TCP_RST:
- break;
- default:
- return -EINVAL;
- }
- return 0;
-}
-
-static int nft_reject_bridge_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
-{
- const struct nft_reject *priv = nft_expr_priv(expr);
-
- if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
- goto nla_put_failure;
-
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
- goto nla_put_failure;
- break;
- default:
- break;
- }
-
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
static struct nft_expr_type nft_reject_bridge_type;
static const struct nft_expr_ops nft_reject_bridge_ops = {
.type = &nft_reject_bridge_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
.eval = nft_reject_bridge_eval,
- .init = nft_reject_bridge_init,
- .dump = nft_reject_bridge_dump,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
.validate = nft_reject_bridge_validate,
};
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index d0a4d0ac7045..9cef9496a707 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -21,7 +21,6 @@
#define SRVL_FLOW_OFF 0x81
#define SRVL_FLOW_ON 0x80
#define SRVL_SET_PIN 0x82
-#define SRVL_CTRL_PKT_SIZE 1
#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
diff --git a/net/can/Kconfig b/net/can/Kconfig
index 25436a715db3..7c9958df91d3 100644
--- a/net/can/Kconfig
+++ b/net/can/Kconfig
@@ -55,6 +55,21 @@ config CAN_GW
source "net/can/j1939/Kconfig"
+config CAN_ISOTP
+ tristate "ISO 15765-2:2016 CAN transport protocol"
+ help
+ CAN Transport Protocols offer support for segmented Point-to-Point
+ communication between CAN nodes via two defined CAN Identifiers.
+	  As CAN frames can only transport a small number of data bytes
+	  (max. 8 bytes for 'classic' CAN and max. 64 bytes for CAN FD), this
+	  segmentation is needed to transport longer Protocol Data Units (PDUs)
+ as needed e.g. for vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN
+ traffic.
+ This protocol driver implements data transfers according to
+ ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types.
+ If you want to perform automotive vehicle diagnostic services (UDS),
+ say 'y'.
+
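A minimal user-space sketch of how such an ISO-TP socket might be used, assuming the UAPI constants from <linux/can.h> and <linux/can/isotp.h>; the interface name "can0" and the 0x7e0/0x7e8 addressing pair are placeholders, not part of the patch:

#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/can.h>
#include <linux/can/isotp.h>

int main(void)
{
	struct sockaddr_can addr = { 0 };
	int s = socket(PF_CAN, SOCK_DGRAM, CAN_ISOTP);

	if (s < 0)
		return 1;

	addr.can_family = AF_CAN;
	addr.can_ifindex = if_nametoindex("can0");	/* placeholder interface */
	addr.can_addr.tp.tx_id = 0x7e0;			/* requests (example IDs) */
	addr.can_addr.tp.rx_id = 0x7e8;			/* responses */

	if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	/* write()/read() now carry complete ISO-TP PDUs; segmentation and
	 * flow control are handled by the kernel.
	 */
	write(s, "\x3e\x00", 2);	/* e.g. a UDS TesterPresent request */
	close(s);
	return 0;
}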
source "drivers/net/can/Kconfig"
endif
diff --git a/net/can/Makefile b/net/can/Makefile
index 08bd217fc051..58f2c31c1ef3 100644
--- a/net/can/Makefile
+++ b/net/can/Makefile
@@ -17,3 +17,6 @@ obj-$(CONFIG_CAN_GW) += can-gw.o
can-gw-y := gw.o
obj-$(CONFIG_CAN_J1939) += j1939/
+
+obj-$(CONFIG_CAN_ISOTP) += can-isotp.o
+can-isotp-y := isotp.o
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 5c06404bdf3e..837bb8af0ec3 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -338,7 +338,7 @@ static unsigned int effhash(canid_t can_id)
* can_rcv_list_find - determine optimal filterlist inside device filter struct
* @can_id: pointer to CAN identifier of a given can_filter
* @mask: pointer to CAN mask of a given can_filter
- * @d: pointer to the device filter struct
+ * @dev_rcv_lists: pointer to the device filter struct
*
* Description:
* Returns the optimal filterlist to reduce the filter handling in the
@@ -358,7 +358,7 @@ static unsigned int effhash(canid_t can_id)
*
* Return:
* Pointer to optimal filterlist for the given can_id/mask pair.
- * Constistency checked mask.
+ * Consistency checked mask.
* Reduced can_id to have a preprocessed filter compare value.
*/
static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask,
@@ -411,7 +411,7 @@ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask,
/**
* can_rx_register - subscribe CAN frames from a specific interface
* @net: the applicable net namespace
- * @dev: pointer to netdevice (NULL => subcribe from 'all' CAN devices list)
+ * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list)
* @can_id: CAN identifier (see description)
* @mask: CAN mask (see description)
* @func: callback function on filter match
@@ -541,10 +541,13 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id,
/* Check for bugs in CAN protocol implementations using af_can.c:
* 'rcv' will be NULL if no matching list item was found for removal.
+ * As this case may potentially happen when closing a socket while
+ * the notifier for removing the CAN netdev is running, we just print
+ * a warning here.
*/
if (!rcv) {
- WARN(1, "BUG: receive list entry not found for dev %s, id %03X, mask %03X\n",
- DNAME(dev), can_id, mask);
+ pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n",
+ DNAME(dev), can_id, mask);
goto out;
}
@@ -677,16 +680,25 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev,
{
struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
- if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU ||
- cfd->len > CAN_MAX_DLEN)) {
- pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n",
+ if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU)) {
+ pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n",
+ dev->type, skb->len);
+ goto free_skb;
+ }
+
+ /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. */
+ if (unlikely(cfd->len > CAN_MAX_DLEN)) {
+ pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d, datalen %d\n",
dev->type, skb->len, cfd->len);
- kfree_skb(skb);
- return NET_RX_DROP;
+ goto free_skb;
}
can_receive(skb, dev);
return NET_RX_SUCCESS;
+
+free_skb:
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -694,16 +706,25 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
{
struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
- if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU ||
- cfd->len > CANFD_MAX_DLEN)) {
- pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n",
+ if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU)) {
+ pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n",
+ dev->type, skb->len);
+ goto free_skb;
+ }
+
+ /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. */
+ if (unlikely(cfd->len > CANFD_MAX_DLEN)) {
+ pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d, datalen %d\n",
dev->type, skb->len, cfd->len);
- kfree_skb(skb);
- return NET_RX_DROP;
+ goto free_skb;
}
can_receive(skb, dev);
return NET_RX_SUCCESS;
+
+free_skb:
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
/* af_can protocol functions */
@@ -870,12 +891,12 @@ static __init int can_init(void)
int err;
/* check for correct padding to be able to use the structs similarly */
- BUILD_BUG_ON(offsetof(struct can_frame, can_dlc) !=
+ BUILD_BUG_ON(offsetof(struct can_frame, len) !=
offsetof(struct canfd_frame, len) ||
offsetof(struct can_frame, data) !=
offsetof(struct canfd_frame, data));
- pr_info("can: controller area network core (" CAN_VERSION_STRING ")\n");
+ pr_info("can: controller area network core\n");
rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver),
0, 0, NULL);
diff --git a/net/can/bcm.c b/net/can/bcm.c
index d14ea12affb1..0e5c37be4a2b 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
*
@@ -81,8 +81,6 @@
(CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
(CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
-#define CAN_BCM_VERSION "20170425"
-
MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
@@ -1696,7 +1694,7 @@ static int __init bcm_module_init(void)
{
int err;
- pr_info("can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n");
+ pr_info("can: broadcast manager protocol\n");
err = can_proto_register(&bcm_can_proto);
if (err < 0) {
diff --git a/net/can/gw.c b/net/can/gw.c
index 65d60c93af29..8598d9da0e5f 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/* gw.c - CAN frame Gateway/Router/Bridge with netlink interface
*
* Copyright (c) 2019 Volkswagen Group Electronic Research
@@ -59,7 +59,6 @@
#include <net/net_namespace.h>
#include <net/sock.h>
-#define CAN_GW_VERSION "20190810"
#define CAN_GW_NAME "can-gw"
MODULE_DESCRIPTION("PF_CAN netlink gateway");
@@ -200,6 +199,68 @@ static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod)
memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN);
}
+/* retrieve valid CC DLC value and store it into 'len' */
+static void mod_retrieve_ccdlc(struct canfd_frame *cf)
+{
+ struct can_frame *ccf = (struct can_frame *)cf;
+
+ /* len8_dlc is only valid if len == CAN_MAX_DLEN */
+ if (ccf->len != CAN_MAX_DLEN)
+ return;
+
+ /* do we have a valid len8_dlc value from 9 .. 15 ? */
+ if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC)
+ ccf->len = ccf->len8_dlc;
+}
+
+/* convert valid CC DLC value in 'len' into struct can_frame elements */
+static void mod_store_ccdlc(struct canfd_frame *cf)
+{
+ struct can_frame *ccf = (struct can_frame *)cf;
+
+ /* clear potential leftovers */
+ ccf->len8_dlc = 0;
+
+ /* plain data length 0 .. 8 - that was easy */
+ if (ccf->len <= CAN_MAX_DLEN)
+ return;
+
+	/* potentially broken values are caught in can_can_gw_rcv() */
+ if (ccf->len > CAN_MAX_RAW_DLC)
+ return;
+
+ /* we have a valid dlc value from 9 .. 15 in ccf->len */
+ ccf->len8_dlc = ccf->len;
+ ccf->len = CAN_MAX_DLEN;
+}
+
+static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_retrieve_ccdlc(cf);
+ mod_and_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_retrieve_ccdlc(cf);
+ mod_or_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_retrieve_ccdlc(cf);
+ mod_xor_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_set_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
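An illustrative round trip for the helpers above (not part of the patch): a Classical CAN frame carrying a raw DLC of 12 is stored by the stack as len = 8 with len8_dlc = 12; mod_retrieve_ccdlc() temporarily exposes the raw value in len so the DLC modification functions can operate on it, and mod_store_ccdlc() folds it back:

	struct can_frame ccf = { .len = 8, .len8_dlc = 12 };

	mod_retrieve_ccdlc((struct canfd_frame *)&ccf);	/* ccf.len == 12 */
	/* ... AND/OR/XOR/SET modification runs on the raw DLC here ... */
	mod_store_ccdlc((struct canfd_frame *)&ccf);	/* ccf.len == 8, ccf.len8_dlc == 12 */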
static void canframecpy(struct canfd_frame *dst, struct can_frame *src)
{
/* Copy the struct members separately to ensure that no uninitialized
@@ -208,7 +269,7 @@ static void canframecpy(struct canfd_frame *dst, struct can_frame *src)
*/
dst->can_id = src->can_id;
- dst->len = src->can_dlc;
+ dst->len = src->len;
*(u64 *)dst->data = *(u64 *)src->data;
}
@@ -843,8 +904,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
if (mb.modtype & CGW_MOD_ID)
mod->modfunc[modidx++] = mod_and_id;
- if (mb.modtype & CGW_MOD_LEN)
- mod->modfunc[modidx++] = mod_and_len;
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_and_ccdlc;
if (mb.modtype & CGW_MOD_DATA)
mod->modfunc[modidx++] = mod_and_data;
@@ -859,8 +920,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
if (mb.modtype & CGW_MOD_ID)
mod->modfunc[modidx++] = mod_or_id;
- if (mb.modtype & CGW_MOD_LEN)
- mod->modfunc[modidx++] = mod_or_len;
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_or_ccdlc;
if (mb.modtype & CGW_MOD_DATA)
mod->modfunc[modidx++] = mod_or_data;
@@ -875,8 +936,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
if (mb.modtype & CGW_MOD_ID)
mod->modfunc[modidx++] = mod_xor_id;
- if (mb.modtype & CGW_MOD_LEN)
- mod->modfunc[modidx++] = mod_xor_len;
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_xor_ccdlc;
if (mb.modtype & CGW_MOD_DATA)
mod->modfunc[modidx++] = mod_xor_data;
@@ -891,8 +952,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
if (mb.modtype & CGW_MOD_ID)
mod->modfunc[modidx++] = mod_set_id;
- if (mb.modtype & CGW_MOD_LEN)
- mod->modfunc[modidx++] = mod_set_len;
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_set_ccdlc;
if (mb.modtype & CGW_MOD_DATA)
mod->modfunc[modidx++] = mod_set_data;
@@ -1194,8 +1255,7 @@ static __init int cgw_module_init(void)
/* sanitize given module parameter */
max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS);
- pr_info("can: netlink gateway (rev " CAN_GW_VERSION ") max_hops=%d\n",
- max_hops);
+ pr_info("can: netlink gateway - max_hops=%d\n", max_hops);
ret = register_pernet_subsys(&cangw_pernet_ops);
if (ret)
diff --git a/net/can/isotp.c b/net/can/isotp.c
new file mode 100644
index 000000000000..3ef7f78e553b
--- /dev/null
+++ b/net/can/isotp.c
@@ -0,0 +1,1446 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN
+ *
+ * This implementation does not provide ISO-TP specific return values to the
+ * userspace.
+ *
+ * - RX path timeout of data reception leads to -ETIMEDOUT
+ * - RX path SN mismatch leads to -EILSEQ
+ * - RX path data reception with wrong padding leads to -EBADMSG
+ * - TX path flowcontrol reception timeout leads to -ECOMM
+ * - TX path flowcontrol reception overflow leads to -EMSGSIZE
+ * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG
+ * - when a transfer (tx) is in progress, the next write() blocks until it's done
+ * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent
+ * - as we have static buffers the check whether the PDU fits into the buffer
+ * is done at FF reception time (no support for sending 'wait frames')
+ * - take care of the tx-queue-len as traffic shaping is still on the TODO list
+ *
+ * Copyright (c) 2020 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/hrtimer.h>
+#include <linux/wait.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/can/isotp.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+
+MODULE_DESCRIPTION("PF_CAN isotp 15765-2:2016 protocol");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>");
+MODULE_ALIAS("can-proto-6");
+
+#define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \
+ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
+ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
+
+/* ISO 15765-2:2016 supports more than 4095 bytes per ISO PDU as the FF_DL can
+ * take full 32 bit values (4 Gbyte). We would need a good concept to handle
+ * this between user space and kernel space. For now increase the static buffer
+ * to about 8 kbyte to be able to test this new functionality.
+ */
+#define MAX_MSG_LENGTH 8200
+
+/* N_PCI type values in bits 7-4 of N_PCI bytes */
+#define N_PCI_SF 0x00 /* single frame */
+#define N_PCI_FF 0x10 /* first frame */
+#define N_PCI_CF 0x20 /* consecutive frame */
+#define N_PCI_FC 0x30 /* flow control */
+
+#define N_PCI_SZ 1 /* size of the PCI byte #1 */
+#define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */
+#define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */
+#define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */
+#define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */
+#define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */
+
+#define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA)
+
+/* Flow Status given in FC frame */
+#define ISOTP_FC_CTS 0 /* clear to send */
+#define ISOTP_FC_WT 1 /* wait */
+#define ISOTP_FC_OVFLW 2 /* overflow */
+
+enum {
+ ISOTP_IDLE = 0,
+ ISOTP_WAIT_FIRST_FC,
+ ISOTP_WAIT_FC,
+ ISOTP_WAIT_DATA,
+ ISOTP_SENDING
+};
+
+struct tpcon {
+ int idx;
+ int len;
+ u8 state;
+ u8 bs;
+ u8 sn;
+ u8 ll_dl;
+ u8 buf[MAX_MSG_LENGTH + 1];
+};
+
+struct isotp_sock {
+ struct sock sk;
+ int bound;
+ int ifindex;
+ canid_t txid;
+ canid_t rxid;
+ ktime_t tx_gap;
+ ktime_t lastrxcf_tstamp;
+ struct hrtimer rxtimer, txtimer;
+ struct can_isotp_options opt;
+ struct can_isotp_fc_options rxfc, txfc;
+ struct can_isotp_ll_options ll;
+ u32 force_tx_stmin;
+ u32 force_rx_stmin;
+ struct tpcon rx, tx;
+ struct notifier_block notifier;
+ wait_queue_head_t wait;
+};
+
+static inline struct isotp_sock *isotp_sk(const struct sock *sk)
+{
+ return (struct isotp_sock *)sk;
+}
+
+static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer)
+{
+ struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
+ rxtimer);
+ struct sock *sk = &so->sk;
+
+ if (so->rx.state == ISOTP_WAIT_DATA) {
+ /* we did not get new data frames in time */
+
+ /* report 'connection timed out' */
+ sk->sk_err = ETIMEDOUT;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+
+ /* reset rx state */
+ so->rx.state = ISOTP_IDLE;
+ }
+
+ return HRTIMER_NORESTART;
+}
+
+static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus)
+{
+ struct net_device *dev;
+ struct sk_buff *nskb;
+ struct canfd_frame *ncf;
+ struct isotp_sock *so = isotp_sk(sk);
+ int can_send_ret;
+
+ nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any());
+ if (!nskb)
+ return 1;
+
+ dev = dev_get_by_index(sock_net(sk), so->ifindex);
+ if (!dev) {
+ kfree_skb(nskb);
+ return 1;
+ }
+
+ can_skb_reserve(nskb);
+ can_skb_prv(nskb)->ifindex = dev->ifindex;
+ can_skb_prv(nskb)->skbcnt = 0;
+
+ nskb->dev = dev;
+ can_skb_set_owner(nskb, sk);
+ ncf = (struct canfd_frame *)nskb->data;
+ skb_put(nskb, so->ll.mtu);
+
+ /* create & send flow control reply */
+ ncf->can_id = so->txid;
+
+ if (so->opt.flags & CAN_ISOTP_TX_PADDING) {
+ memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN);
+ ncf->len = CAN_MAX_DLEN;
+ } else {
+ ncf->len = ae + FC_CONTENT_SZ;
+ }
+
+ ncf->data[ae] = N_PCI_FC | flowstatus;
+ ncf->data[ae + 1] = so->rxfc.bs;
+ ncf->data[ae + 2] = so->rxfc.stmin;
+
+ if (ae)
+ ncf->data[0] = so->opt.ext_address;
+
+ if (so->ll.mtu == CANFD_MTU)
+ ncf->flags = so->ll.tx_flags;
+
+ can_send_ret = can_send(nskb, 1);
+ if (can_send_ret)
+ pr_notice_once("can-isotp: %s: can_send_ret %d\n",
+ __func__, can_send_ret);
+
+ dev_put(dev);
+
+ /* reset blocksize counter */
+ so->rx.bs = 0;
+
+ /* reset last CF frame rx timestamp for rx stmin enforcement */
+ so->lastrxcf_tstamp = ktime_set(0, 0);
+
+ /* start rx timeout watchdog */
+ hrtimer_start(&so->rxtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
+ return 0;
+}
+
+static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb;
+
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can));
+
+ memset(addr, 0, sizeof(*addr));
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = skb->dev->ifindex;
+
+ if (sock_queue_rcv_skb(sk, skb) < 0)
+ kfree_skb(skb);
+}
+
+static u8 padlen(u8 datalen)
+{
+ static const u8 plen[] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */
+ 12, 12, 12, 12, /* 9 - 12 */
+ 16, 16, 16, 16, /* 13 - 16 */
+ 20, 20, 20, 20, /* 17 - 20 */
+ 24, 24, 24, 24, /* 21 - 24 */
+ 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */
+ 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */
+ 48, 48, 48, 48, 48, 48, 48, 48 /* 41 - 48 */
+ };
+
+ if (datalen > 48)
+ return 64;
+
+ return plen[datalen];
+}
+
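For illustration, padlen(10) returns 12 and padlen(47) returns 48, i.e. the next valid CAN FD frame length, while any data length above 48 maps to the CAN FD maximum of 64.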
+/* check for length optimization and return 1/true when the check fails */
+static int check_optimized(struct canfd_frame *cf, int start_index)
+{
+ /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the
+ * padding would start at this point. E.g. if the padding would
+ * start at cf.data[7] cf->len has to be 7 to be optimal.
+ * Note: The data[] index starts with zero.
+ */
+ if (cf->len <= CAN_MAX_DLEN)
+ return (cf->len != start_index);
+
+ /* This relation is also valid in the non-linear DLC range, where
+ * we need to take care of the minimal next possible CAN_DL.
+ * The correct check would be (padlen(cf->len) != padlen(start_index)).
+ * But as cf->len can only take discrete values from 12, .., 64 at this
+ * point the padlen(cf->len) is always equal to cf->len.
+ */
+ return (cf->len != padlen(start_index));
+}
+
+/* check padding and return 1/true when the check fails */
+static int check_pad(struct isotp_sock *so, struct canfd_frame *cf,
+ int start_index, u8 content)
+{
+ int i;
+
+ /* no RX_PADDING value => check length of optimized frame length */
+ if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) {
+ if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN)
+ return check_optimized(cf, start_index);
+
+ /* no valid test against empty value => ignore frame */
+ return 1;
+ }
+
+ /* check datalength of correctly padded CAN frame */
+ if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) &&
+ cf->len != padlen(cf->len))
+ return 1;
+
+ /* check padding content */
+ if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) {
+ for (i = start_index; i < cf->len; i++)
+ if (cf->data[i] != content)
+ return 1;
+ }
+ return 0;
+}
+
+static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
+{
+ struct sock *sk = &so->sk;
+
+ if (so->tx.state != ISOTP_WAIT_FC &&
+ so->tx.state != ISOTP_WAIT_FIRST_FC)
+ return 0;
+
+ hrtimer_cancel(&so->txtimer);
+
+ if ((cf->len < ae + FC_CONTENT_SZ) ||
+ ((so->opt.flags & ISOTP_CHECK_PADDING) &&
+ check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) {
+ /* malformed PDU - report 'not a data message' */
+ sk->sk_err = EBADMSG;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+ return 1;
+ }
+
+ /* get communication parameters only from the first FC frame */
+ if (so->tx.state == ISOTP_WAIT_FIRST_FC) {
+ so->txfc.bs = cf->data[ae + 1];
+ so->txfc.stmin = cf->data[ae + 2];
+
+		/* fix wrong STmin values according to the spec */
+ if (so->txfc.stmin > 0x7F &&
+ (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9))
+ so->txfc.stmin = 0x7F;
+
+ so->tx_gap = ktime_set(0, 0);
+ /* add transmission time for CAN frame N_As */
+ so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime);
+ /* add waiting time for consecutive frames N_Cs */
+ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
+ so->tx_gap = ktime_add_ns(so->tx_gap,
+ so->force_tx_stmin);
+ else if (so->txfc.stmin < 0x80)
+ so->tx_gap = ktime_add_ns(so->tx_gap,
+ so->txfc.stmin * 1000000);
+ else
+ so->tx_gap = ktime_add_ns(so->tx_gap,
+ (so->txfc.stmin - 0xF0)
+ * 100000);
+ so->tx.state = ISOTP_WAIT_FC;
+ }
+
+ switch (cf->data[ae] & 0x0F) {
+ case ISOTP_FC_CTS:
+ so->tx.bs = 0;
+ so->tx.state = ISOTP_SENDING;
+ /* start cyclic timer for sending CF frame */
+ hrtimer_start(&so->txtimer, so->tx_gap,
+ HRTIMER_MODE_REL_SOFT);
+ break;
+
+ case ISOTP_FC_WT:
+ /* start timer to wait for next FC frame */
+ hrtimer_start(&so->txtimer, ktime_set(1, 0),
+ HRTIMER_MODE_REL_SOFT);
+ break;
+
+ case ISOTP_FC_OVFLW:
+ /* overflow on receiver side - report 'message too long' */
+ sk->sk_err = EMSGSIZE;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ fallthrough;
+
+ default:
+ /* stop this tx job */
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+ }
+ return 0;
+}
+
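A worked example of the tx_gap computation above (values illustrative): an STmin of 0x05 adds 5 ms (5 * 1000000 ns) between consecutive frames, an STmin of 0xF3 lies in the 0xF1..0xF9 range and adds (0xF3 - 0xF0) * 100000 ns = 300 us, and any reserved value above 0x7F outside that range is first clamped to the maximum of 0x7F (127 ms).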
+static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen,
+ struct sk_buff *skb, int len)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+ struct sk_buff *nskb;
+
+ hrtimer_cancel(&so->rxtimer);
+ so->rx.state = ISOTP_IDLE;
+
+ if (!len || len > cf->len - pcilen)
+ return 1;
+
+ if ((so->opt.flags & ISOTP_CHECK_PADDING) &&
+ check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) {
+ /* malformed PDU - report 'not a data message' */
+ sk->sk_err = EBADMSG;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ return 1;
+ }
+
+ nskb = alloc_skb(len, gfp_any());
+ if (!nskb)
+ return 1;
+
+ memcpy(skb_put(nskb, len), &cf->data[pcilen], len);
+
+ nskb->tstamp = skb->tstamp;
+ nskb->dev = skb->dev;
+ isotp_rcv_skb(nskb, sk);
+ return 0;
+}
+
+static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+ int i;
+ int off;
+ int ff_pci_sz;
+
+ hrtimer_cancel(&so->rxtimer);
+ so->rx.state = ISOTP_IDLE;
+
+ /* get the used sender LL_DL from the (first) CAN frame data length */
+ so->rx.ll_dl = padlen(cf->len);
+
+ /* the first frame has to use the entire frame up to LL_DL length */
+ if (cf->len != so->rx.ll_dl)
+ return 1;
+
+ /* get the FF_DL */
+ so->rx.len = (cf->data[ae] & 0x0F) << 8;
+ so->rx.len += cf->data[ae + 1];
+
+ /* Check for FF_DL escape sequence supporting 32 bit PDU length */
+ if (so->rx.len) {
+ ff_pci_sz = FF_PCI_SZ12;
+ } else {
+ /* FF_DL = 0 => get real length from next 4 bytes */
+ so->rx.len = cf->data[ae + 2] << 24;
+ so->rx.len += cf->data[ae + 3] << 16;
+ so->rx.len += cf->data[ae + 4] << 8;
+ so->rx.len += cf->data[ae + 5];
+ ff_pci_sz = FF_PCI_SZ32;
+ }
+
+ /* take care of a potential SF_DL ESC offset for TX_DL > 8 */
+ off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0;
+
+ if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl)
+ return 1;
+
+ if (so->rx.len > MAX_MSG_LENGTH) {
+ /* send FC frame with overflow status */
+ isotp_send_fc(sk, ae, ISOTP_FC_OVFLW);
+ return 1;
+ }
+
+ /* copy the first received data bytes */
+ so->rx.idx = 0;
+ for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++)
+ so->rx.buf[so->rx.idx++] = cf->data[i];
+
+ /* initial setup for this pdu reception */
+ so->rx.sn = 1;
+ so->rx.state = ISOTP_WAIT_DATA;
+
+ /* no creation of flow control frames */
+ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE)
+ return 0;
+
+ /* send our first FC frame */
+ isotp_send_fc(sk, ae, ISOTP_FC_CTS);
+ return 0;
+}
+
+static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae,
+ struct sk_buff *skb)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+ struct sk_buff *nskb;
+ int i;
+
+ if (so->rx.state != ISOTP_WAIT_DATA)
+ return 0;
+
+ /* drop if timestamp gap is less than force_rx_stmin nanoseconds */
+ if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) {
+ if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) <
+ so->force_rx_stmin)
+ return 0;
+
+ so->lastrxcf_tstamp = skb->tstamp;
+ }
+
+ hrtimer_cancel(&so->rxtimer);
+
+ /* CFs are never longer than the FF */
+ if (cf->len > so->rx.ll_dl)
+ return 1;
+
+ /* CFs usually have the LL_DL length */
+ if (cf->len < so->rx.ll_dl) {
+ /* this is only allowed for the last CF */
+ if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ)
+ return 1;
+ }
+
+ if ((cf->data[ae] & 0x0F) != so->rx.sn) {
+ /* wrong sn detected - report 'illegal byte sequence' */
+ sk->sk_err = EILSEQ;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+
+ /* reset rx state */
+ so->rx.state = ISOTP_IDLE;
+ return 1;
+ }
+ so->rx.sn++;
+ so->rx.sn %= 16;
+
+ for (i = ae + N_PCI_SZ; i < cf->len; i++) {
+ so->rx.buf[so->rx.idx++] = cf->data[i];
+ if (so->rx.idx >= so->rx.len)
+ break;
+ }
+
+ if (so->rx.idx >= so->rx.len) {
+ /* we are done */
+ so->rx.state = ISOTP_IDLE;
+
+ if ((so->opt.flags & ISOTP_CHECK_PADDING) &&
+ check_pad(so, cf, i + 1, so->opt.rxpad_content)) {
+ /* malformed PDU - report 'not a data message' */
+ sk->sk_err = EBADMSG;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ return 1;
+ }
+
+ nskb = alloc_skb(so->rx.len, gfp_any());
+ if (!nskb)
+ return 1;
+
+ memcpy(skb_put(nskb, so->rx.len), so->rx.buf,
+ so->rx.len);
+
+ nskb->tstamp = skb->tstamp;
+ nskb->dev = skb->dev;
+ isotp_rcv_skb(nskb, sk);
+ return 0;
+ }
+
+ /* perform blocksize handling, if enabled */
+ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) {
+ /* start rx timeout watchdog */
+ hrtimer_start(&so->rxtimer, ktime_set(1, 0),
+ HRTIMER_MODE_REL_SOFT);
+ return 0;
+ }
+
+ /* no creation of flow control frames */
+ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE)
+ return 0;
+
+ /* we reached the specified blocksize so->rxfc.bs */
+ isotp_send_fc(sk, ae, ISOTP_FC_CTS);
+ return 0;
+}
+
+static void isotp_rcv(struct sk_buff *skb, void *data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct canfd_frame *cf;
+ int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
+ u8 n_pci_type, sf_dl;
+
+ /* Strictly receive only frames with the configured MTU size
+ * => clear separation of CAN 2.0 / CAN FD transport channels
+ */
+ if (skb->len != so->ll.mtu)
+ return;
+
+ cf = (struct canfd_frame *)skb->data;
+
+ /* if enabled: check reception of my configured extended address */
+ if (ae && cf->data[0] != so->opt.rx_ext_address)
+ return;
+
+ n_pci_type = cf->data[ae] & 0xF0;
+
+ if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) {
+ /* check rx/tx path half duplex expectations */
+ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) ||
+ (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC))
+ return;
+ }
+
+ switch (n_pci_type) {
+ case N_PCI_FC:
+ /* tx path: flow control frame containing the FC parameters */
+ isotp_rcv_fc(so, cf, ae);
+ break;
+
+ case N_PCI_SF:
+ /* rx path: single frame
+ *
+ * As we do not have a rx.ll_dl configuration, we can only test
+ * if the CAN frame's payload length matches the LL_DL == 8
+ * requirement - no matter if it's CAN 2.0 or CAN FD
+ */
+
+ /* get the SF_DL from the N_PCI byte */
+ sf_dl = cf->data[ae] & 0x0F;
+
+ if (cf->len <= CAN_MAX_DLEN) {
+ isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl);
+ } else {
+ if (skb->len == CANFD_MTU) {
+ /* We have a CAN FD frame and CAN_DL is greater than 8:
+ * Only frames with the SF_DL == 0 ESC value are valid.
+ *
+ * If so, take care of the increased SF PCI size
+ * (SF_PCI_SZ8) to point to the message content behind
+ * the extended SF PCI info and get the real SF_DL
+ * length value from the formerly first data byte.
+ */
+ if (sf_dl == 0)
+ isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb,
+ cf->data[SF_PCI_SZ4 + ae]);
+ }
+ }
+ break;
+
+ case N_PCI_FF:
+ /* rx path: first frame */
+ isotp_rcv_ff(sk, cf, ae);
+ break;
+
+ case N_PCI_CF:
+ /* rx path: consecutive frame */
+ isotp_rcv_cf(sk, cf, ae, skb);
+ break;
+ }
+}
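The dispatcher keys on the upper nibble of the first PCI byte. For orientation, the values conventionally behind the N_PCI_* constants used above (defined earlier in this file; listed here as an assumption taken from ISO 15765-2):

	/* assumed PCI type nibbles (ISO 15765-2) matching the switch above */
	#define EXAMPLE_N_PCI_SF	0x00	/* single frame */
	#define EXAMPLE_N_PCI_FF	0x10	/* first frame */
	#define EXAMPLE_N_PCI_CF	0x20	/* consecutive frame */
	#define EXAMPLE_N_PCI_FC	0x30	/* flow control */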
+
+static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so,
+ int ae, int off)
+{
+ int pcilen = N_PCI_SZ + ae + off;
+ int space = so->tx.ll_dl - pcilen;
+ int num = min_t(int, so->tx.len - so->tx.idx, space);
+ int i;
+
+ cf->can_id = so->txid;
+ cf->len = num + pcilen;
+
+ if (num < space) {
+ if (so->opt.flags & CAN_ISOTP_TX_PADDING) {
+ /* user requested padding */
+ cf->len = padlen(cf->len);
+ memset(cf->data, so->opt.txpad_content, cf->len);
+ } else if (cf->len > CAN_MAX_DLEN) {
+ /* mandatory padding for CAN FD frames */
+ cf->len = padlen(cf->len);
+ memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT,
+ cf->len);
+ }
+ }
+
+ for (i = 0; i < num; i++)
+ cf->data[pcilen + i] = so->tx.buf[so->tx.idx++];
+
+ if (ae)
+ cf->data[0] = so->opt.ext_address;
+}
+
+static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so,
+ int ae)
+{
+ int i;
+ int ff_pci_sz;
+
+ cf->can_id = so->txid;
+ cf->len = so->tx.ll_dl;
+ if (ae)
+ cf->data[0] = so->opt.ext_address;
+
+ /* create N_PCI bytes with 12/32 bit FF_DL data length */
+ if (so->tx.len > 4095) {
+ /* use 32 bit FF_DL notation */
+ cf->data[ae] = N_PCI_FF;
+ cf->data[ae + 1] = 0;
+ cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU;
+ cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU;
+ cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU;
+ cf->data[ae + 5] = (u8)so->tx.len & 0xFFU;
+ ff_pci_sz = FF_PCI_SZ32;
+ } else {
+ /* use 12 bit FF_DL notation */
+ cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF;
+ cf->data[ae + 1] = (u8)so->tx.len & 0xFFU;
+ ff_pci_sz = FF_PCI_SZ12;
+ }
+
+ /* add first data bytes depending on ae */
+ for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++)
+ cf->data[i] = so->tx.buf[so->tx.idx++];
+
+ so->tx.sn = 1;
+ so->tx.state = ISOTP_WAIT_FIRST_FC;
+}
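The FF_DL encoding above uses the 12 bit form up to 4095 bytes and falls back to the escape sequence (12 bit field set to zero, real length in the following four bytes) for larger PDUs. A self-contained sketch of the same layout for normal addressing, assuming the conventional N_PCI_FF value of 0x10:

	/* sketch: encode FF_DL as in isotp_create_fframe(), normal addressing */
	static int example_encode_ff_dl(u8 *data, u32 ff_dl)
	{
		if (ff_dl > 4095) {
			data[0] = 0x10;			/* N_PCI_FF, 12 bit FF_DL = 0 => escape */
			data[1] = 0;
			data[2] = (ff_dl >> 24) & 0xFF;
			data[3] = (ff_dl >> 16) & 0xFF;
			data[4] = (ff_dl >> 8) & 0xFF;
			data[5] = ff_dl & 0xFF;
			return 6;			/* FF_PCI_SZ32 */
		}

		data[0] = 0x10 | ((ff_dl >> 8) & 0x0F);	/* e.g. 300 bytes -> 0x11 0x2C */
		data[1] = ff_dl & 0xFF;
		return 2;				/* FF_PCI_SZ12 */
	}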
+
+static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer)
+{
+ struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
+ txtimer);
+ struct sock *sk = &so->sk;
+ struct sk_buff *skb;
+ struct net_device *dev;
+ struct canfd_frame *cf;
+ enum hrtimer_restart restart = HRTIMER_NORESTART;
+ int can_send_ret;
+ int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
+
+ switch (so->tx.state) {
+ case ISOTP_WAIT_FC:
+ case ISOTP_WAIT_FIRST_FC:
+
+ /* we did not get any flow control frame in time */
+
+ /* report 'communication error on send' */
+ sk->sk_err = ECOMM;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+
+ /* reset tx state */
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+ break;
+
+ case ISOTP_SENDING:
+
+ /* push out the next segmented pdu */
+ dev = dev_get_by_index(sock_net(sk), so->ifindex);
+ if (!dev)
+ break;
+
+isotp_tx_burst:
+ skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv),
+ GFP_ATOMIC);
+ if (!skb) {
+ dev_put(dev);
+ break;
+ }
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+
+ cf = (struct canfd_frame *)skb->data;
+ skb_put(skb, so->ll.mtu);
+
+ /* create consecutive frame */
+ isotp_fill_dataframe(cf, so, ae, 0);
+
+ /* place consecutive frame N_PCI at the appropriate index */
+ cf->data[ae] = N_PCI_CF | so->tx.sn++;
+ so->tx.sn %= 16;
+ so->tx.bs++;
+
+ if (so->ll.mtu == CANFD_MTU)
+ cf->flags = so->ll.tx_flags;
+
+ skb->dev = dev;
+ can_skb_set_owner(skb, sk);
+
+ can_send_ret = can_send(skb, 1);
+ if (can_send_ret)
+ pr_notice_once("can-isotp: %s: can_send_ret %d\n",
+ __func__, can_send_ret);
+
+ if (so->tx.idx >= so->tx.len) {
+ /* we are done */
+ so->tx.state = ISOTP_IDLE;
+ dev_put(dev);
+ wake_up_interruptible(&so->wait);
+ break;
+ }
+
+ if (so->txfc.bs && so->tx.bs >= so->txfc.bs) {
+ /* stop and wait for FC */
+ so->tx.state = ISOTP_WAIT_FC;
+ dev_put(dev);
+ hrtimer_set_expires(&so->txtimer,
+ ktime_add(ktime_get(),
+ ktime_set(1, 0)));
+ restart = HRTIMER_RESTART;
+ break;
+ }
+
+ /* no gap between data frames needed => use burst mode */
+ if (!so->tx_gap)
+ goto isotp_tx_burst;
+
+ /* start timer to send next data frame with correct delay */
+ dev_put(dev);
+ hrtimer_set_expires(&so->txtimer,
+ ktime_add(ktime_get(), so->tx_gap));
+ restart = HRTIMER_RESTART;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ return restart;
+}
+
+static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct sk_buff *skb;
+ struct net_device *dev;
+ struct canfd_frame *cf;
+ int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
+ int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0;
+ int off;
+ int err;
+
+ if (!so->bound)
+ return -EADDRNOTAVAIL;
+
+ /* we do not support multiple buffers - for now */
+ if (so->tx.state != ISOTP_IDLE || wq_has_sleeper(&so->wait)) {
+ if (msg->msg_flags & MSG_DONTWAIT)
+ return -EAGAIN;
+
+ /* wait for complete transmission of current pdu */
+ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ }
+
+ if (!size || size > MAX_MSG_LENGTH)
+ return -EINVAL;
+
+ /* take care of a potential SF_DL ESC offset for TX_DL > 8 */
+ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0;
+
+ /* does the given data fit into a single frame for SF_BROADCAST? */
+ if ((so->opt.flags & CAN_ISOTP_SF_BROADCAST) &&
+ (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off))
+ return -EINVAL;
+
+ err = memcpy_from_msg(so->tx.buf, msg, size);
+ if (err < 0)
+ return err;
+
+ dev = dev_get_by_index(sock_net(sk), so->ifindex);
+ if (!dev)
+ return -ENXIO;
+
+ skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv),
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb) {
+ dev_put(dev);
+ return err;
+ }
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+
+ so->tx.state = ISOTP_SENDING;
+ so->tx.len = size;
+ so->tx.idx = 0;
+
+ cf = (struct canfd_frame *)skb->data;
+ skb_put(skb, so->ll.mtu);
+
+ /* check for single frame transmission depending on TX_DL */
+ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) {
+ /* The message size generally fits into a SingleFrame - good.
+ *
+ * SF_DL ESC offset optimization:
+ *
+ * When TX_DL is greater than 8 but the message would still fit
+ * into an 8 byte CAN frame, we can omit the offset.
+ * This prevents a protocol-caused length extension from
+ * CAN_DL = 8 to CAN_DL = 12 due to the SF_DL ESC handling.
+ */
+ if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae)
+ off = 0;
+
+ isotp_fill_dataframe(cf, so, ae, off);
+
+ /* place single frame N_PCI w/o length at the appropriate index */
+ cf->data[ae] = N_PCI_SF;
+
+ /* place SF_DL size value depending on the SF_DL ESC offset */
+ if (off)
+ cf->data[SF_PCI_SZ4 + ae] = size;
+ else
+ cf->data[ae] |= size;
+
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+
+ /* don't enable wait queue for a single frame transmission */
+ wait_tx_done = 0;
+ } else {
+ /* send first frame and wait for FC */
+
+ isotp_create_fframe(cf, so, ae);
+
+ /* start timeout for FC */
+ hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
+ }
+
+ /* send the first or only CAN frame */
+ if (so->ll.mtu == CANFD_MTU)
+ cf->flags = so->ll.tx_flags;
+
+ skb->dev = dev;
+ skb->sk = sk;
+ err = can_send(skb, 1);
+ dev_put(dev);
+ if (err) {
+ pr_notice_once("can-isotp: %s: can_send_ret %d\n",
+ __func__, err);
+ return err;
+ }
+
+ if (wait_tx_done) {
+ /* wait for complete transmission of current pdu */
+ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ }
+
+ return size;
+}
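Taken together with isotp_bind() further below, this is the usual datagram flow from user space. A minimal user-space sketch, assuming the UAPI header linux/can/isotp.h added by this series and a CAN interface named can0 (the addresses are illustrative only):

	#include <string.h>
	#include <unistd.h>
	#include <net/if.h>
	#include <sys/socket.h>
	#include <linux/can.h>
	#include <linux/can/isotp.h>

	int main(void)
	{
		struct sockaddr_can addr = { .can_family = AF_CAN };
		unsigned char pdu[100];
		int s;

		s = socket(PF_CAN, SOCK_DGRAM, CAN_ISOTP);
		if (s < 0)
			return 1;

		addr.can_ifindex = if_nametoindex("can0");
		addr.can_addr.tp.tx_id = 0x7E0;		/* CAN ID used for SF/FF/CF */
		addr.can_addr.tp.rx_id = 0x7E8;		/* CAN ID expected for FC */

		if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
			return 1;

		memset(pdu, 0xAA, sizeof(pdu));
		/* 100 bytes exceed TX_DL, so isotp_sendmsg() segments into FF + CFs */
		write(s, pdu, sizeof(pdu));

		close(s);
		return 0;
	}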
+
+static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int err = 0;
+ int noblock;
+
+ noblock = flags & MSG_DONTWAIT;
+ flags &= ~MSG_DONTWAIT;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ return err;
+
+ if (size < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+ else
+ size = skb->len;
+
+ err = memcpy_to_msg(msg, skb->data, size);
+ if (err < 0) {
+ skb_free_datagram(sk, skb);
+ return err;
+ }
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ if (msg->msg_name) {
+ msg->msg_namelen = sizeof(struct sockaddr_can);
+ memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+ }
+
+ skb_free_datagram(sk, skb);
+
+ return size;
+}
+
+static int isotp_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so;
+ struct net *net;
+
+ if (!sk)
+ return 0;
+
+ so = isotp_sk(sk);
+ net = sock_net(sk);
+
+ /* wait for complete transmission of current pdu */
+ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+
+ unregister_netdevice_notifier(&so->notifier);
+
+ lock_sock(sk);
+
+ hrtimer_cancel(&so->txtimer);
+ hrtimer_cancel(&so->rxtimer);
+
+ /* remove current filters & unregister */
+ if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) {
+ if (so->ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, so->ifindex);
+ if (dev) {
+ can_rx_unregister(net, dev, so->rxid,
+ SINGLE_MASK(so->rxid),
+ isotp_rcv, sk);
+ dev_put(dev);
+ }
+ }
+ }
+
+ so->ifindex = 0;
+ so->bound = 0;
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct net *net = sock_net(sk);
+ int ifindex;
+ struct net_device *dev;
+ int err = 0;
+ int notify_enetdown = 0;
+ int do_rx_reg = 1;
+
+ if (len < CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp))
+ return -EINVAL;
+
+ /* do not register frame reception for functional addressing */
+ if (so->opt.flags & CAN_ISOTP_SF_BROADCAST)
+ do_rx_reg = 0;
+
+ /* do not validate rx address for functional addressing */
+ if (do_rx_reg) {
+ if (addr->can_addr.tp.rx_id == addr->can_addr.tp.tx_id)
+ return -EADDRNOTAVAIL;
+
+ if (addr->can_addr.tp.rx_id & (CAN_ERR_FLAG | CAN_RTR_FLAG))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (addr->can_addr.tp.tx_id & (CAN_ERR_FLAG | CAN_RTR_FLAG))
+ return -EADDRNOTAVAIL;
+
+ if (!addr->can_ifindex)
+ return -ENODEV;
+
+ lock_sock(sk);
+
+ if (so->bound && addr->can_ifindex == so->ifindex &&
+ addr->can_addr.tp.rx_id == so->rxid &&
+ addr->can_addr.tp.tx_id == so->txid)
+ goto out;
+
+ dev = dev_get_by_index(net, addr->can_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (dev->type != ARPHRD_CAN) {
+ dev_put(dev);
+ err = -ENODEV;
+ goto out;
+ }
+ if (dev->mtu < so->ll.mtu) {
+ dev_put(dev);
+ err = -EINVAL;
+ goto out;
+ }
+ if (!(dev->flags & IFF_UP))
+ notify_enetdown = 1;
+
+ ifindex = dev->ifindex;
+
+ if (do_rx_reg)
+ can_rx_register(net, dev, addr->can_addr.tp.rx_id,
+ SINGLE_MASK(addr->can_addr.tp.rx_id),
+ isotp_rcv, sk, "isotp", sk);
+
+ dev_put(dev);
+
+ if (so->bound && do_rx_reg) {
+ /* unregister old filter */
+ if (so->ifindex) {
+ dev = dev_get_by_index(net, so->ifindex);
+ if (dev) {
+ can_rx_unregister(net, dev, so->rxid,
+ SINGLE_MASK(so->rxid),
+ isotp_rcv, sk);
+ dev_put(dev);
+ }
+ }
+ }
+
+ /* switch to new settings */
+ so->ifindex = ifindex;
+ so->rxid = addr->can_addr.tp.rx_id;
+ so->txid = addr->can_addr.tp.tx_id;
+ so->bound = 1;
+
+out:
+ release_sock(sk);
+
+ if (notify_enetdown) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ }
+
+ return err;
+}
+
+static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+
+ if (peer)
+ return -EOPNOTSUPP;
+
+ memset(addr, 0, sizeof(*addr));
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = so->ifindex;
+ addr->can_addr.tp.rx_id = so->rxid;
+ addr->can_addr.tp.tx_id = so->txid;
+
+ return sizeof(*addr);
+}
+
+static int isotp_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ int ret = 0;
+
+ if (level != SOL_CAN_ISOTP)
+ return -EINVAL;
+
+ if (so->bound)
+ return -EISCONN;
+
+ switch (optname) {
+ case CAN_ISOTP_OPTS:
+ if (optlen != sizeof(struct can_isotp_options))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->opt, optval, optlen))
+ return -EFAULT;
+
+ /* no separate rx_ext_address is given => use ext_address */
+ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR))
+ so->opt.rx_ext_address = so->opt.ext_address;
+ break;
+
+ case CAN_ISOTP_RECV_FC:
+ if (optlen != sizeof(struct can_isotp_fc_options))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->rxfc, optval, optlen))
+ return -EFAULT;
+ break;
+
+ case CAN_ISOTP_TX_STMIN:
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen))
+ return -EFAULT;
+ break;
+
+ case CAN_ISOTP_RX_STMIN:
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen))
+ return -EFAULT;
+ break;
+
+ case CAN_ISOTP_LL_OPTS:
+ if (optlen == sizeof(struct can_isotp_ll_options)) {
+ struct can_isotp_ll_options ll;
+
+ if (copy_from_sockptr(&ll, optval, optlen))
+ return -EFAULT;
+
+ /* check for correct ISO 11898-1 DLC data length */
+ if (ll.tx_dl != padlen(ll.tx_dl))
+ return -EINVAL;
+
+ if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU)
+ return -EINVAL;
+
+ if (ll.mtu == CAN_MTU && ll.tx_dl > CAN_MAX_DLEN)
+ return -EINVAL;
+
+ memcpy(&so->ll, &ll, sizeof(ll));
+
+ /* set ll_dl for the tx path the same way as for the rx path */
+ so->tx.ll_dl = ll.tx_dl;
+ } else {
+ return -EINVAL;
+ }
+ break;
+
+ default:
+ ret = -ENOPROTOOPT;
+ }
+
+ return ret;
+}
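The CAN_ISOTP_LL_OPTS branch is what switches a socket to CAN FD framing, and it is only accepted while the socket is still unbound. Continuing the user-space sketch shown after isotp_sendmsg(), with s the not-yet-bound ISO-TP socket (field names as referenced above, values illustrative):

	struct can_isotp_ll_options ll = {
		.mtu = CANFD_MTU,	/* carry CAN FD frames */
		.tx_dl = 64,		/* must equal padlen(tx_dl); > 8 requires CANFD_MTU */
		.tx_flags = 0,		/* e.g. CANFD_BRS could be requested here */
	};

	if (setsockopt(s, SOL_CAN_ISOTP, CAN_ISOTP_LL_OPTS, &ll, sizeof(ll)) < 0)
		return 1;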
+
+static int isotp_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ int len;
+ void *val;
+
+ if (level != SOL_CAN_ISOTP)
+ return -EINVAL;
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case CAN_ISOTP_OPTS:
+ len = min_t(int, len, sizeof(struct can_isotp_options));
+ val = &so->opt;
+ break;
+
+ case CAN_ISOTP_RECV_FC:
+ len = min_t(int, len, sizeof(struct can_isotp_fc_options));
+ val = &so->rxfc;
+ break;
+
+ case CAN_ISOTP_TX_STMIN:
+ len = min_t(int, len, sizeof(u32));
+ val = &so->force_tx_stmin;
+ break;
+
+ case CAN_ISOTP_RX_STMIN:
+ len = min_t(int, len, sizeof(u32));
+ val = &so->force_rx_stmin;
+ break;
+
+ case CAN_ISOTP_LL_OPTS:
+ len = min_t(int, len, sizeof(struct can_isotp_ll_options));
+ val = &so->ll;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, val, len))
+ return -EFAULT;
+ return 0;
+}
+
+static int isotp_notifier(struct notifier_block *nb, unsigned long msg,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct isotp_sock *so = container_of(nb, struct isotp_sock, notifier);
+ struct sock *sk = &so->sk;
+
+ if (!net_eq(dev_net(dev), sock_net(sk)))
+ return NOTIFY_DONE;
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+
+ if (so->ifindex != dev->ifindex)
+ return NOTIFY_DONE;
+
+ switch (msg) {
+ case NETDEV_UNREGISTER:
+ lock_sock(sk);
+ /* remove current filters & unregister */
+ if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST)))
+ can_rx_unregister(dev_net(dev), dev, so->rxid,
+ SINGLE_MASK(so->rxid),
+ isotp_rcv, sk);
+
+ so->ifindex = 0;
+ so->bound = 0;
+ release_sock(sk);
+
+ sk->sk_err = ENODEV;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ break;
+
+ case NETDEV_DOWN:
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int isotp_init(struct sock *sk)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+
+ so->ifindex = 0;
+ so->bound = 0;
+
+ so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS;
+ so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
+ so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
+ so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
+ so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
+ so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
+ so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS;
+ so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN;
+ so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX;
+ so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU;
+ so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL;
+ so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS;
+
+ /* set ll_dl for the tx path the same way as for the rx path */
+ so->tx.ll_dl = so->ll.tx_dl;
+
+ so->rx.state = ISOTP_IDLE;
+ so->tx.state = ISOTP_IDLE;
+
+ hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ so->rxtimer.function = isotp_rx_timer_handler;
+ hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ so->txtimer.function = isotp_tx_timer_handler;
+
+ init_waitqueue_head(&so->wait);
+
+ so->notifier.notifier_call = isotp_notifier;
+ register_netdevice_notifier(&so->notifier);
+
+ return 0;
+}
+
+static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
+static const struct proto_ops isotp_ops = {
+ .family = PF_CAN,
+ .release = isotp_release,
+ .bind = isotp_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = isotp_getname,
+ .poll = datagram_poll,
+ .ioctl = isotp_sock_no_ioctlcmd,
+ .gettstamp = sock_gettstamp,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = isotp_setsockopt,
+ .getsockopt = isotp_getsockopt,
+ .sendmsg = isotp_sendmsg,
+ .recvmsg = isotp_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct proto isotp_proto __read_mostly = {
+ .name = "CAN_ISOTP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct isotp_sock),
+ .init = isotp_init,
+};
+
+static const struct can_proto isotp_can_proto = {
+ .type = SOCK_DGRAM,
+ .protocol = CAN_ISOTP,
+ .ops = &isotp_ops,
+ .prot = &isotp_proto,
+};
+
+static __init int isotp_module_init(void)
+{
+ int err;
+
+ pr_info("can: isotp protocol\n");
+
+ err = can_proto_register(&isotp_can_proto);
+ if (err < 0)
+ pr_err("can: registration of isotp protocol failed\n");
+
+ return err;
+}
+
+static __exit void isotp_module_exit(void)
+{
+ can_proto_unregister(&isotp_can_proto);
+}
+
+module_init(isotp_module_init);
+module_exit(isotp_module_exit);
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
index 137054bff9ec..bb914d8b4216 100644
--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -62,7 +62,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data)
skb_pull(skb, J1939_CAN_HDR);
/* fix length, set to dlc, with 8 maximum */
- skb_trim(skb, min_t(uint8_t, cf->can_dlc, 8));
+ skb_trim(skb, min_t(uint8_t, cf->len, 8));
/* set addr */
skcb = j1939_skb_to_cb(skb);
@@ -335,7 +335,7 @@ int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb)
canid |= skcb->addr.da << 8;
cf->can_id = canid;
- cf->can_dlc = dlc;
+ cf->len = dlc;
return can_send(skb, 1);
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index 1be4c898b2fa..f23966526a88 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -475,6 +475,12 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
goto out_release_sock;
}
+ if (!(ndev->flags & IFF_UP)) {
+ dev_put(ndev);
+ ret = -ENETDOWN;
+ goto out_release_sock;
+ }
+
priv = j1939_netdev_start(ndev);
dev_put(ndev);
if (IS_ERR(priv)) {
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index 0cec4152f979..e09d087ba240 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -580,6 +580,7 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv,
skb->dev = priv->ndev;
can_skb_reserve(skb);
can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
/* reserve CAN header */
skb_reserve(skb, offsetof(struct can_frame, data));
@@ -1487,6 +1488,7 @@ j1939_session *j1939_session_fresh_new(struct j1939_priv *priv,
skb->dev = priv->ndev;
can_skb_reserve(skb);
can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
skcb = j1939_skb_to_cb(skb);
memcpy(skcb, rel_skcb, sizeof(*skcb));
diff --git a/net/can/proc.c b/net/can/proc.c
index e6881bfc3ed1..5ea8695f507e 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* proc.c - procfs support for Protocol family CAN core module
*
@@ -54,7 +54,6 @@
* proc filenames for the PF_CAN core
*/
-#define CAN_PROC_VERSION "version"
#define CAN_PROC_STATS "stats"
#define CAN_PROC_RESET_STATS "reset_stats"
#define CAN_PROC_RCVLIST_ALL "rcvlist_all"
@@ -293,12 +292,6 @@ static int can_reset_stats_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int can_version_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "%s\n", CAN_VERSION_STRING);
- return 0;
-}
-
static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx,
struct net_device *dev,
struct can_dev_rcv_lists *dev_rcv_lists)
@@ -441,8 +434,6 @@ void can_init_proc(struct net *net)
}
/* own procfs entries from the AF_CAN core */
- net->can.pde_version = proc_create_net_single(CAN_PROC_VERSION, 0644,
- net->can.proc_dir, can_version_proc_show, NULL);
net->can.pde_stats = proc_create_net_single(CAN_PROC_STATS, 0644,
net->can.proc_dir, can_stats_proc_show, NULL);
net->can.pde_reset_stats = proc_create_net_single(CAN_PROC_RESET_STATS,
@@ -471,8 +462,8 @@ void can_init_proc(struct net *net)
*/
void can_remove_proc(struct net *net)
{
- if (net->can.pde_version)
- remove_proc_entry(CAN_PROC_VERSION, net->can.proc_dir);
+ if (!net->can.proc_dir)
+ return;
if (net->can.pde_stats)
remove_proc_entry(CAN_PROC_STATS, net->can.proc_dir);
@@ -498,6 +489,5 @@ void can_remove_proc(struct net *net)
if (net->can.pde_rcvlist_sff)
remove_proc_entry(CAN_PROC_RCVLIST_SFF, net->can.proc_dir);
- if (net->can.proc_dir)
- remove_proc_entry("can", net->proc_net);
+ remove_proc_entry("can", net->proc_net);
}
diff --git a/net/can/raw.c b/net/can/raw.c
index 94a9405658dc..6ec8aa1d0da4 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/* raw.c - Raw sockets for protocol family CAN
*
* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
@@ -55,8 +55,6 @@
#include <net/sock.h>
#include <net/net_namespace.h>
-#define CAN_RAW_VERSION CAN_VERSION
-
MODULE_DESCRIPTION("PF_CAN raw protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>");
@@ -154,16 +152,16 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
if (!skb)
return;
- /* Put the datagram to the queue so that raw_recvmsg() can
- * get it from there. We need to pass the interface index to
- * raw_recvmsg(). We pass a whole struct sockaddr_can in skb->cb
- * containing the interface index.
+ /* Put the datagram to the queue so that raw_recvmsg() can get
+ * it from there. We need to pass the interface index to
+ * raw_recvmsg(). We pass a whole struct sockaddr_can in
+ * skb->cb containing the interface index.
*/
sock_skb_cb_check_size(sizeof(struct sockaddr_can));
addr = (struct sockaddr_can *)skb->cb;
memset(addr, 0, sizeof(*addr));
- addr->can_family = AF_CAN;
+ addr->can_family = AF_CAN;
addr->can_ifindex = skb->dev->ifindex;
/* add CAN specific message flags for raw_recvmsg() */
@@ -290,8 +288,8 @@ static int raw_notifier(struct notifier_block *nb,
kfree(ro->filter);
ro->ifindex = 0;
- ro->bound = 0;
- ro->count = 0;
+ ro->bound = 0;
+ ro->count = 0;
release_sock(sk);
sk->sk_err = ENODEV;
@@ -374,8 +372,8 @@ static int raw_release(struct socket *sock)
kfree(ro->filter);
ro->ifindex = 0;
- ro->bound = 0;
- ro->count = 0;
+ ro->bound = 0;
+ ro->count = 0;
free_percpu(ro->uniq);
sock_orphan(sk);
@@ -773,7 +771,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
skb_setup_tx_timestamp(skb, sk->sk_tsflags);
skb->dev = dev;
- skb->sk = sk;
+ skb->sk = sk;
skb->priority = sk->sk_priority;
err = can_send(skb, ro->loopback);
@@ -801,8 +799,12 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int err = 0;
int noblock;
- noblock = flags & MSG_DONTWAIT;
- flags &= ~MSG_DONTWAIT;
+ noblock = flags & MSG_DONTWAIT;
+ flags &= ~MSG_DONTWAIT;
+
+ if (flags & MSG_ERRQUEUE)
+ return sock_recv_errqueue(sk, msg, size,
+ SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE);
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb)
@@ -881,7 +883,7 @@ static __init int raw_module_init(void)
{
int err;
- pr_info("can: raw protocol (rev " CAN_RAW_VERSION ")\n");
+ pr_info("can: raw protocol\n");
err = can_proto_register(&raw_can_proto);
if (err < 0)
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index f36f9a3a4e20..c5c4eef3a9ff 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -5,6 +5,9 @@ config CEPH_LIB
select LIBCRC32C
select CRYPTO_AES
select CRYPTO_CBC
+ select CRYPTO_GCM
+ select CRYPTO_HMAC
+ select CRYPTO_SHA256
select CRYPTO
select KEYS
default n
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index ce09bb4fb249..8802a0c0155d 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -14,4 +14,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
crypto.o armor.o \
auth_x.o \
ceph_strings.o ceph_hash.o \
- pagevec.o snapshot.o string_table.o
+ pagevec.o snapshot.o string_table.o \
+ messenger_v1.o messenger_v2.o
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index fbeee068ea14..eb261aa5fe18 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -21,28 +21,31 @@ static u32 supported_protocols[] = {
CEPH_AUTH_CEPHX
};
-static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+static int init_protocol(struct ceph_auth_client *ac, int proto)
{
- switch (protocol) {
+ dout("%s proto %d\n", __func__, proto);
+
+ switch (proto) {
case CEPH_AUTH_NONE:
return ceph_auth_none_init(ac);
case CEPH_AUTH_CEPHX:
return ceph_x_init(ac);
default:
- return -ENOENT;
+ pr_err("bad auth protocol %d\n", proto);
+ return -EINVAL;
}
}
/*
* setup, teardown.
*/
-struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key)
+struct ceph_auth_client *ceph_auth_init(const char *name,
+ const struct ceph_crypto_key *key,
+ const int *con_modes)
{
struct ceph_auth_client *ac;
int ret;
- dout("auth_init name '%s'\n", name);
-
ret = -ENOMEM;
ac = kzalloc(sizeof(*ac), GFP_NOFS);
if (!ac)
@@ -54,8 +57,12 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp
ac->name = name;
else
ac->name = CEPH_AUTH_NAME_DEFAULT;
- dout("auth_init name %s\n", ac->name);
ac->key = key;
+ ac->preferred_mode = con_modes[0];
+ ac->fallback_mode = con_modes[1];
+
+ dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__,
+ ac->name, ac->preferred_mode, ac->fallback_mode);
return ac;
out:
@@ -145,31 +152,35 @@ bad:
goto out;
}
-static int ceph_build_auth_request(struct ceph_auth_client *ac,
- void *msg_buf, size_t msg_len)
+static int build_request(struct ceph_auth_client *ac, bool add_header,
+ void *buf, int buf_len)
{
- struct ceph_mon_request_header *monhdr = msg_buf;
- void *p = monhdr + 1;
- void *end = msg_buf + msg_len;
+ void *end = buf + buf_len;
+ void *p;
int ret;
- monhdr->have_version = 0;
- monhdr->session_mon = cpu_to_le16(-1);
- monhdr->session_mon_tid = 0;
-
- ceph_encode_32(&p, ac->protocol);
+ p = buf;
+ if (add_header) {
+ /* struct ceph_mon_request_header + protocol */
+ ceph_encode_64_safe(&p, end, 0, e_range);
+ ceph_encode_16_safe(&p, end, -1, e_range);
+ ceph_encode_64_safe(&p, end, 0, e_range);
+ ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+ }
+ ceph_encode_need(&p, end, sizeof(u32), e_range);
ret = ac->ops->build_request(ac, p + sizeof(u32), end);
if (ret < 0) {
- pr_err("error %d building auth method %s request\n", ret,
- ac->ops->name);
- goto out;
+ pr_err("auth protocol '%s' building request failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), ret);
+ return ret;
}
dout(" built request %d bytes\n", ret);
ceph_encode_32(&p, ret);
- ret = p + ret - msg_buf;
-out:
- return ret;
+ return p + ret - buf;
+
+e_range:
+ return -ERANGE;
}
/*
@@ -229,10 +240,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ac->ops = NULL;
}
if (ac->protocol != protocol) {
- ret = ceph_auth_init_protocol(ac, protocol);
+ ret = init_protocol(ac, protocol);
if (ret) {
- pr_err("error %d on auth protocol %d init\n",
- ret, protocol);
+ pr_err("auth protocol '%s' init failed: %d\n",
+ ceph_auth_proto_name(protocol), ret);
goto out;
}
}
@@ -240,12 +251,13 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ac->negotiating = false;
}
- ret = ac->ops->handle_reply(ac, result, payload, payload_end);
- if (ret == -EAGAIN) {
- ret = ceph_build_auth_request(ac, reply_buf, reply_len);
- } else if (ret) {
- pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
- }
+ ret = ac->ops->handle_reply(ac, result, payload, payload_end,
+ NULL, NULL, NULL, NULL);
+ if (ret == -EAGAIN)
+ ret = build_request(ac, true, reply_buf, reply_len);
+ else if (ret)
+ pr_err("auth protocol '%s' mauth authentication failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), result);
out:
mutex_unlock(&ac->mutex);
@@ -264,7 +276,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
mutex_lock(&ac->mutex);
if (ac->ops->should_authenticate(ac))
- ret = ceph_build_auth_request(ac, msg_buf, msg_len);
+ ret = build_request(ac, true, msg_buf, msg_len);
mutex_unlock(&ac->mutex);
return ret;
}
@@ -281,19 +293,38 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
}
EXPORT_SYMBOL(ceph_auth_is_authenticated);
-int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *auth)
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, bool force_new,
+ int *proto, int *pref_mode, int *fallb_mode)
{
- int ret = 0;
+ int ret;
mutex_lock(&ac->mutex);
- if (ac->ops && ac->ops->create_authorizer)
+ if (force_new && auth->authorizer) {
+ ceph_auth_destroy_authorizer(auth->authorizer);
+ auth->authorizer = NULL;
+ }
+ if (!auth->authorizer)
ret = ac->ops->create_authorizer(ac, peer_type, auth);
+ else if (ac->ops->update_authorizer)
+ ret = ac->ops->update_authorizer(ac, peer_type, auth);
+ else
+ ret = 0;
+ if (ret)
+ goto out;
+
+ *proto = ac->protocol;
+ if (pref_mode && fallb_mode) {
+ *pref_mode = ac->preferred_mode;
+ *fallb_mode = ac->fallback_mode;
+ }
+
+out:
mutex_unlock(&ac->mutex);
return ret;
}
-EXPORT_SYMBOL(ceph_auth_create_authorizer);
+EXPORT_SYMBOL(__ceph_auth_get_authorizer);
void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
{
@@ -301,20 +332,6 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
}
EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
-int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *a)
-{
- int ret = 0;
-
- mutex_lock(&ac->mutex);
- if (ac->ops && ac->ops->update_authorizer)
- ret = ac->ops->update_authorizer(ac, peer_type, a);
- mutex_unlock(&ac->mutex);
- return ret;
-}
-EXPORT_SYMBOL(ceph_auth_update_authorizer);
-
int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
struct ceph_authorizer *a,
void *challenge_buf,
@@ -332,13 +349,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge);
int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a)
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
{
int ret = 0;
mutex_lock(&ac->mutex);
if (ac->ops && ac->ops->verify_authorizer_reply)
- ret = ac->ops->verify_authorizer_reply(ac, a);
+ ret = ac->ops->verify_authorizer_reply(ac, a,
+ reply, reply_len, session_key, session_key_len,
+ con_secret, con_secret_len);
mutex_unlock(&ac->mutex);
return ret;
}
@@ -352,3 +374,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
mutex_unlock(&ac->mutex);
}
EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
+
+/*
+ * msgr2 authentication
+ */
+
+static bool contains(const int *arr, int cnt, int val)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ if (arr[i] == val)
+ return true;
+ }
+
+ return false;
+}
+
+static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode)
+{
+ WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN);
+ if (fallb_mode != CEPH_CON_MODE_UNKNOWN) {
+ ceph_encode_32_safe(p, end, 2, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ ceph_encode_32_safe(p, end, fallb_mode, e_range);
+ } else {
+ ceph_encode_32_safe(p, end, 1, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ }
+
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+/*
+ * Similar to ceph_auth_build_hello().
+ */
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len)
+{
+ int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE;
+ void *end = buf + buf_len;
+ void *lenp;
+ void *p;
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (ac->protocol == CEPH_AUTH_UNKNOWN) {
+ ret = init_protocol(ac, proto);
+ if (ret) {
+ pr_err("auth protocol '%s' init failed: %d\n",
+ ceph_auth_proto_name(proto), ret);
+ goto out;
+ }
+ } else {
+ WARN_ON(ac->protocol != proto);
+ ac->ops->reset(ac);
+ }
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+ ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode);
+ if (ret)
+ goto out;
+
+ lenp = p;
+ p += 4; /* space for len */
+
+ ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range);
+ ret = ceph_auth_entity_name_encode(ac->name, &p, end);
+ if (ret)
+ goto out;
+
+ ceph_encode_64_safe(&p, end, ac->global_id, e_range);
+ ceph_encode_32(&lenp, p - lenp - 4);
+ ret = p - buf;
+
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+
+e_range:
+ ret = -ERANGE;
+ goto out;
+}
+
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+ int reply_len, void *buf, int buf_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+ NULL, NULL, NULL, NULL);
+ if (ret == -EAGAIN)
+ ret = build_request(ac, false, buf, buf_len);
+ else
+ WARN_ON(ret >= 0);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (global_id && ac->global_id != global_id) {
+ dout("%s global_id %llu -> %llu\n", __func__, ac->global_id,
+ global_id);
+ ac->global_id = global_id;
+ }
+
+ ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed\n",
+ ceph_auth_proto_name(ac->protocol));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->preferred_mode));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->fallback_mode));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' msgr authentication failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), result);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ int pref_mode, fallb_mode;
+ int proto;
+ void *p;
+ int ret;
+
+ ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto,
+ &pref_mode, &fallb_mode);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, proto, e_range);
+ ret = encode_con_modes(&p, end, pref_mode, fallb_mode);
+ if (ret)
+ return ret;
+
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_get_authorizer);
+
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ void *p;
+ int ret;
+
+ ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer,
+ reply, reply_len);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more);
+
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ reply, reply_len, session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done);
+
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+ int peer_type, int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed by %s\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->preferred_mode),
+ ceph_entity_type_name(peer_type));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->fallback_mode),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' authorization to %s failed: %d\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type), result);
+
+ if (ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, peer_type);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer);
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index edb7042479ed..70e86e462250 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -70,7 +70,9 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
* authenticate state, so nothing happens here.
*/
static int handle_reply(struct ceph_auth_client *ac, int result,
- void *buf, void *end)
+ void *buf, void *end, u8 *session_key,
+ int *session_key_len, u8 *con_secret,
+ int *con_secret_len)
{
struct ceph_auth_none_info *xi = ac->private;
@@ -116,7 +118,6 @@ static int ceph_auth_none_create_authorizer(
}
static const struct ceph_auth_client_ops ceph_auth_none_ops = {
- .name = "none",
.reset = reset,
.destroy = destroy,
.is_authenticated = is_authenticated,
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index b52732337ca6..ca44c327bace 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -22,12 +22,15 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
{
struct ceph_x_info *xi = ac->private;
- int need;
+ int missing;
+ int need; /* missing + need renewal */
ceph_x_validate_tickets(ac, &need);
- dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
- ac->want_keys, need, xi->have_keys);
- return (ac->want_keys & xi->have_keys) == ac->want_keys;
+ missing = ac->want_keys & ~xi->have_keys;
+ WARN_ON((need & missing) != missing);
+ dout("%s want 0x%x have 0x%x missing 0x%x -> %d\n", __func__,
+ ac->want_keys, xi->have_keys, missing, !missing);
+ return !missing;
}
static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
@@ -36,9 +39,9 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
int need;
ceph_x_validate_tickets(ac, &need);
- dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
- ac->want_keys, need, xi->have_keys);
- return need != 0;
+ dout("%s want 0x%x have 0x%x need 0x%x -> %d\n", __func__,
+ ac->want_keys, xi->have_keys, need, !!need);
+ return !!need;
}
static int ceph_x_encrypt_offset(void)
@@ -197,7 +200,7 @@ static int process_one_ticket(struct ceph_auth_client *ac,
dout(" decrypted %d bytes\n", ret);
dend = dp + ret;
- tkt_struct_v = ceph_decode_8(&dp);
+ ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad);
if (tkt_struct_v != 1)
goto bad;
@@ -205,6 +208,7 @@ static int process_one_ticket(struct ceph_auth_client *ac,
if (ret)
goto out;
+ ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad);
ceph_decode_timespec64(&validity, dp);
dp += sizeof(struct ceph_timespec);
new_expires = ktime_get_real_seconds() + validity.tv_sec;
@@ -265,22 +269,21 @@ out:
static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
struct ceph_crypto_key *secret,
- void *buf, void *end)
+ void **p, void *end)
{
- void *p = buf;
u8 reply_struct_v;
u32 num;
int ret;
- ceph_decode_8_safe(&p, end, reply_struct_v, bad);
+ ceph_decode_8_safe(p, end, reply_struct_v, bad);
if (reply_struct_v != 1)
return -EINVAL;
- ceph_decode_32_safe(&p, end, num, bad);
+ ceph_decode_32_safe(p, end, num, bad);
dout("%d tickets\n", num);
while (num--) {
- ret = process_one_ticket(ac, secret, &p, end);
+ ret = process_one_ticket(ac, secret, p, end);
if (ret)
return ret;
}
@@ -379,6 +382,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
}
}
au->service = th->service;
+ WARN_ON(!th->secret_id);
au->secret_id = th->secret_id;
msg_a = au->buf->vec.iov_base;
@@ -442,9 +446,10 @@ static bool need_key(struct ceph_x_ticket_handler *th)
static bool have_key(struct ceph_x_ticket_handler *th)
{
- if (th->have_key) {
- if (ktime_get_real_seconds() >= th->expires)
- th->have_key = false;
+ if (th->have_key && ktime_get_real_seconds() >= th->expires) {
+ dout("ticket %d (%s) secret_id %llu expired\n", th->service,
+ ceph_entity_type_name(th->service), th->secret_id);
+ th->have_key = false;
}
return th->have_key;
@@ -486,6 +491,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
struct ceph_x_info *xi = ac->private;
int need;
struct ceph_x_request_header *head = buf;
+ void *p;
int ret;
struct ceph_x_ticket_handler *th =
get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
@@ -494,18 +500,17 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
return PTR_ERR(th);
ceph_x_validate_tickets(ac, &need);
-
- dout("build_request want %x have %x need %x\n",
- ac->want_keys, xi->have_keys, need);
+ dout("%s want 0x%x have 0x%x need 0x%x\n", __func__, ac->want_keys,
+ xi->have_keys, need);
if (need & CEPH_ENTITY_TYPE_AUTH) {
struct ceph_x_authenticate *auth = (void *)(head + 1);
- void *p = auth + 1;
void *enc_buf = xi->auth_authorizer.enc_buf;
struct ceph_x_challenge_blob *blob = enc_buf +
ceph_x_encrypt_offset();
u64 *u;
+ p = auth + 1;
if (p > end)
return -ERANGE;
@@ -521,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
if (ret < 0)
return ret;
- auth->struct_v = 1;
+ auth->struct_v = 2; /* nautilus+ */
auth->key = 0;
for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
auth->key ^= *(__le64 *)u;
@@ -534,39 +539,137 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
if (ret < 0)
return ret;
+ /* nautilus+: request service tickets at the same time */
+ need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH;
+ WARN_ON(!need);
+ ceph_encode_32_safe(&p, end, need, e_range);
return p - buf;
}
if (need) {
- void *p = head + 1;
- struct ceph_x_service_ticket_request *req;
-
- if (p > end)
- return -ERANGE;
- head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-
+ dout(" get_principal_session_key\n");
ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
if (ret)
return ret;
- ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
- xi->auth_authorizer.buf->vec.iov_len);
- req = p;
- req->keys = cpu_to_le32(need);
- p += sizeof(*req);
+ p = buf;
+ ceph_encode_16_safe(&p, end, CEPHX_GET_PRINCIPAL_SESSION_KEY,
+ e_range);
+ ceph_encode_copy_safe(&p, end,
+ xi->auth_authorizer.buf->vec.iov_base,
+ xi->auth_authorizer.buf->vec.iov_len, e_range);
+ ceph_encode_8_safe(&p, end, 1, e_range);
+ ceph_encode_32_safe(&p, end, need, e_range);
return p - buf;
}
return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+static int decode_con_secret(void **p, void *end, u8 *con_secret,
+ int *con_secret_len)
+{
+ int len;
+
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+
+ dout("%s len %d\n", __func__, len);
+ if (con_secret) {
+ if (len > CEPH_MAX_CON_SECRET_LEN) {
+ pr_err("connection secret too big %d\n", len);
+ goto bad_memzero;
+ }
+ memcpy(con_secret, *p, len);
+ *con_secret_len = len;
+ }
+ memzero_explicit(*p, len);
+ *p += len;
+ return 0;
+
+bad_memzero:
+ memzero_explicit(*p, len);
+bad:
+ pr_err("failed to decode connection secret\n");
+ return -EINVAL;
+}
+
+static int handle_auth_session_key(struct ceph_auth_client *ac,
+ void **p, void *end,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct ceph_x_ticket_handler *th;
+ void *dp, *dend;
+ int len;
+ int ret;
+
+ /* AUTH ticket */
+ ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end);
+ if (ret)
+ return ret;
+
+ if (*p == end) {
+ /* pre-nautilus (or didn't request service tickets!) */
+ WARN_ON(session_key || con_secret);
+ return 0;
+ }
+
+ th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ if (session_key) {
+ memcpy(session_key, th->session_key.key, th->session_key.len);
+ *session_key_len = th->session_key.len;
+ }
+
+ /* connection secret */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ dout("%s connection secret blob len %d\n", __func__, len);
+ if (len > 0) {
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(&th->session_key, p, *p + len);
+ if (ret < 0)
+ return ret;
+
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dend = dp + ret;
+
+ ret = decode_con_secret(&dp, dend, con_secret, con_secret_len);
+ if (ret)
+ return ret;
+ }
+
+ /* service tickets */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ dout("%s service tickets blob len %d\n", __func__, len);
+ if (len > 0) {
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+ p, *p + len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
}
static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
- void *buf, void *end)
+ void *buf, void *end,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
{
struct ceph_x_info *xi = ac->private;
- struct ceph_x_reply_header *head = buf;
struct ceph_x_ticket_handler *th;
int len = end - buf;
+ void *p;
int op;
int ret;
@@ -587,22 +690,25 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
return -EAGAIN;
}
- op = le16_to_cpu(head->op);
- result = le32_to_cpu(head->result);
+ p = buf;
+ ceph_decode_16_safe(&p, end, op, e_inval);
+ ceph_decode_32_safe(&p, end, result, e_inval);
dout("handle_reply op %d result %d\n", op, result);
switch (op) {
case CEPHX_GET_AUTH_SESSION_KEY:
- /* verify auth key */
- ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
- buf + sizeof(*head), end);
+ /* AUTH ticket + [connection secret] + service tickets */
+ ret = handle_auth_session_key(ac, &p, end, session_key,
+ session_key_len, con_secret,
+ con_secret_len);
break;
case CEPHX_GET_PRINCIPAL_SESSION_KEY:
th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
if (IS_ERR(th))
return PTR_ERR(th);
- ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
- buf + sizeof(*head), end);
+
+ /* service tickets */
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end);
break;
default:
@@ -613,6 +719,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
if (ac->want_keys == xi->have_keys)
return 0;
return -EAGAIN;
+
+e_inval:
+ return -EINVAL;
}
static void ceph_x_destroy_authorizer(struct ceph_authorizer *a)
@@ -678,40 +787,44 @@ static int ceph_x_update_authorizer(
return 0;
}
-static int decrypt_authorize_challenge(struct ceph_x_authorizer *au,
- void *challenge_buf,
- int challenge_buf_len,
- u64 *server_challenge)
+/*
+ * CephXAuthorizeChallenge
+ */
+static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret,
+ void *challenge, int challenge_len,
+ u64 *server_challenge)
{
- struct ceph_x_authorize_challenge *ch =
- challenge_buf + sizeof(struct ceph_x_encrypt_header);
+ void *dp, *dend;
int ret;
/* no leading len */
- ret = __ceph_x_decrypt(&au->session_key, challenge_buf,
- challenge_buf_len);
+ ret = __ceph_x_decrypt(secret, challenge, challenge_len);
if (ret < 0)
return ret;
- if (ret < sizeof(*ch)) {
- pr_err("bad size %d for ceph_x_authorize_challenge\n", ret);
- return -EINVAL;
- }
- *server_challenge = le64_to_cpu(ch->server_challenge);
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dp = challenge + sizeof(struct ceph_x_encrypt_header);
+ dend = dp + ret;
+
+ ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */
+ ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval);
+ dout("%s server_challenge %llu\n", __func__, *server_challenge);
return 0;
+
+e_inval:
+ return -EINVAL;
}
static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
struct ceph_authorizer *a,
- void *challenge_buf,
- int challenge_buf_len)
+ void *challenge, int challenge_len)
{
struct ceph_x_authorizer *au = (void *)a;
u64 server_challenge;
int ret;
- ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len,
- &server_challenge);
+ ret = decrypt_authorizer_challenge(&au->session_key, challenge,
+ challenge_len, &server_challenge);
if (ret) {
pr_err("failed to decrypt authorize challenge: %d", ret);
return ret;
@@ -726,29 +839,67 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
return 0;
}
+/*
+ * CephXAuthorizeReply
+ */
+static int decrypt_authorizer_reply(struct ceph_crypto_key *secret,
+ void **p, void *end, u64 *nonce_plus_one,
+ u8 *con_secret, int *con_secret_len)
+{
+ void *dp, *dend;
+ u8 struct_v;
+ int ret;
+
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(secret, p, end);
+ if (ret < 0)
+ return ret;
+
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dend = dp + ret;
+
+ ceph_decode_8_safe(&dp, dend, struct_v, e_inval);
+ ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval);
+ dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one);
+ if (struct_v >= 2) {
+ ret = decode_con_secret(&dp, dend, con_secret, con_secret_len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a)
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
{
struct ceph_x_authorizer *au = (void *)a;
- void *p = au->enc_buf;
- struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset();
+ u64 nonce_plus_one;
int ret;
- ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN);
- if (ret < 0)
+ if (session_key) {
+ memcpy(session_key, au->session_key.key, au->session_key.len);
+ *session_key_len = au->session_key.len;
+ }
+
+ ret = decrypt_authorizer_reply(&au->session_key, &reply,
+ reply + reply_len, &nonce_plus_one,
+ con_secret, con_secret_len);
+ if (ret)
return ret;
- if (ret < sizeof(*reply)) {
- pr_err("bad size %d for ceph_x_authorize_reply\n", ret);
- return -EINVAL;
+
+ if (nonce_plus_one != au->nonce + 1) {
+ pr_err("failed to authenticate server\n");
+ return -EPERM;
}
- if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one))
- ret = -EPERM;
- else
- ret = 0;
- dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
- au->nonce, le64_to_cpu(reply->nonce_plus_one), ret);
- return ret;
+ return 0;
}
static void ceph_x_reset(struct ceph_auth_client *ac)
@@ -785,8 +936,15 @@ static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
struct ceph_x_ticket_handler *th;
th = get_ticket_handler(ac, peer_type);
- if (!IS_ERR(th))
+ if (IS_ERR(th))
+ return;
+
+ if (th->have_key) {
+ dout("ticket %d (%s) secret_id %llu invalidated\n",
+ th->service, ceph_entity_type_name(th->service),
+ th->secret_id);
th->have_key = false;
+ }
}
static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
@@ -911,7 +1069,6 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
}
static const struct ceph_auth_client_ops ceph_x_ops = {
- .name = "x",
.is_authenticated = ceph_x_is_authenticated,
.should_authenticate = ceph_x_should_authenticate,
.build_request = ceph_x_build_request,
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
index 24b0b74564d0..792fcb974dc3 100644
--- a/net/ceph/auth_x_protocol.h
+++ b/net/ceph/auth_x_protocol.h
@@ -38,7 +38,8 @@ struct ceph_x_authenticate {
__u8 struct_v;
__le64 client_challenge;
__le64 key;
- /* ticket blob */
+ /* old_ticket blob */
+ /* nautilus+: other_keys */
} __attribute__ ((packed));
struct ceph_x_service_ticket_request {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4e7edd707a14..271287c5ec12 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -265,6 +265,7 @@ enum {
Opt_ip,
Opt_crush_location,
Opt_read_from_replica,
+ Opt_ms_mode,
/* string args above */
Opt_share,
Opt_crc,
@@ -287,6 +288,23 @@ static const struct constant_table ceph_param_read_from_replica[] = {
{}
};
+enum ceph_ms_mode {
+ Opt_ms_mode_legacy,
+ Opt_ms_mode_crc,
+ Opt_ms_mode_secure,
+ Opt_ms_mode_prefer_crc,
+ Opt_ms_mode_prefer_secure
+};
+
+static const struct constant_table ceph_param_ms_mode[] = {
+ {"legacy", Opt_ms_mode_legacy},
+ {"crc", Opt_ms_mode_crc},
+ {"secure", Opt_ms_mode_secure},
+ {"prefer-crc", Opt_ms_mode_prefer_crc},
+ {"prefer-secure", Opt_ms_mode_prefer_secure},
+ {}
+};
+
static const struct fs_parameter_spec ceph_parameters[] = {
fsparam_flag ("abort_on_full", Opt_abort_on_full),
fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures),
@@ -305,6 +323,8 @@ static const struct fs_parameter_spec ceph_parameters[] = {
fs_param_deprecated, NULL),
fsparam_enum ("read_from_replica", Opt_read_from_replica,
ceph_param_read_from_replica),
+ fsparam_enum ("ms_mode", Opt_ms_mode,
+ ceph_param_ms_mode),
fsparam_string ("secret", Opt_secret),
fsparam_flag_no ("share", Opt_share),
fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay),
@@ -333,6 +353,8 @@ struct ceph_options *ceph_alloc_options(void)
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT;
+ opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
return opt;
}
EXPORT_SYMBOL(ceph_alloc_options);
@@ -503,6 +525,32 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
BUG();
}
break;
+ case Opt_ms_mode:
+ switch (result.uint_32) {
+ case Opt_ms_mode_legacy:
+ opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ break;
+ case Opt_ms_mode_crc:
+ opt->con_modes[0] = CEPH_CON_MODE_CRC;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ break;
+ case Opt_ms_mode_secure:
+ opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ break;
+ case Opt_ms_mode_prefer_crc:
+ opt->con_modes[0] = CEPH_CON_MODE_CRC;
+ opt->con_modes[1] = CEPH_CON_MODE_SECURE;
+ break;
+ case Opt_ms_mode_prefer_secure:
+ opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+ opt->con_modes[1] = CEPH_CON_MODE_CRC;
+ break;
+ default:
+ BUG();
+ }
+ break;
case Opt_osdtimeout:
warn_plog(&log, "Ignoring osdtimeout");
@@ -616,6 +664,21 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
} else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) {
seq_puts(m, "read_from_replica=localize,");
}
+ if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) {
+ if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+ opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+ seq_puts(m, "ms_mode=crc,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+ opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+ seq_puts(m, "ms_mode=secure,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+ opt->con_modes[1] == CEPH_CON_MODE_SECURE) {
+ seq_puts(m, "ms_mode=prefer-crc,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+ opt->con_modes[1] == CEPH_CON_MODE_CRC) {
+ seq_puts(m, "ms_mode=prefer-secure,");
+ }
+ }
if (opt->flags & CEPH_OPT_FSID)
seq_printf(m, "fsid=%pU,", &opt->fsid);
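The ms_mode option above maps onto a two-entry con_modes[] preference array: entry 0 is the preferred connection mode, entry 1 the fallback, CEPH_CON_MODE_UNKNOWN meaning "none", and an all-UNKNOWN pair selecting the legacy (msgr1) behaviour. A small userspace sketch of the same mapping; the enum values are stand-ins for the kernel constants.

#include <stddef.h>
#include <string.h>

enum { MODE_UNKNOWN, MODE_CRC, MODE_SECURE };	/* stand-ins only */

static const struct {
	const char *opt;
	int modes[2];		/* [0] = preferred, [1] = fallback */
} ms_mode_map[] = {
	{ "legacy",        { MODE_UNKNOWN, MODE_UNKNOWN } },
	{ "crc",           { MODE_CRC,     MODE_UNKNOWN } },
	{ "secure",        { MODE_SECURE,  MODE_UNKNOWN } },
	{ "prefer-crc",    { MODE_CRC,     MODE_SECURE  } },
	{ "prefer-secure", { MODE_SECURE,  MODE_CRC     } },
};

static int lookup_ms_mode(const char *opt, int modes[2])
{
	size_t i;

	for (i = 0; i < sizeof(ms_mode_map) / sizeof(ms_mode_map[0]); i++) {
		if (!strcmp(opt, ms_mode_map[i].opt)) {
			modes[0] = ms_mode_map[i].modes[0];
			modes[1] = ms_mode_map[i].modes[1];
			return 0;
		}
	}
	return -1;		/* unknown mode string */
}

The ceph_print_client_options() hunk above is simply the reverse of this table, emitting ms_mode= only when the pair differs from the all-UNKNOWN default.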
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 10e01494993c..355fea272120 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -18,6 +18,34 @@ const char *ceph_entity_type_name(int type)
}
EXPORT_SYMBOL(ceph_entity_type_name);
+const char *ceph_auth_proto_name(int proto)
+{
+ switch (proto) {
+ case CEPH_AUTH_UNKNOWN:
+ return "unknown";
+ case CEPH_AUTH_NONE:
+ return "none";
+ case CEPH_AUTH_CEPHX:
+ return "cephx";
+ default:
+ return "???";
+ }
+}
+
+const char *ceph_con_mode_name(int mode)
+{
+ switch (mode) {
+ case CEPH_CON_MODE_UNKNOWN:
+ return "unknown";
+ case CEPH_CON_MODE_CRC:
+ return "crc";
+ case CEPH_CON_MODE_SECURE:
+ return "secure";
+ default:
+ return "???";
+ }
+}
+
const char *ceph_osd_op_name(int op)
{
switch (op) {
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 4f75df40fb12..92d89b331645 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -96,6 +96,7 @@ int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
key->len = ceph_decode_16(p);
ceph_decode_need(p, end, key->len, bad);
ret = set_secret(key, *p);
+ memzero_explicit(*p, key->len);
*p += key->len;
return ret;
@@ -134,7 +135,7 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
{
if (key) {
- kfree(key->key);
+ kfree_sensitive(key->key);
key->key = NULL;
if (key->tfm) {
crypto_free_sync_skcipher(key->tfm);
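The crypto.c hunks above scrub key material with memzero_explicit() as soon as it has been copied out of the decode buffer and switch the key free to kfree_sensitive(). A rough userspace analogue of that pattern, assuming a libc that provides explicit_bzero():

#include <string.h>
#include <stdlib.h>

/* Illustrative only: wipe secret bytes before releasing the buffer so
 * the compiler cannot optimize the zeroing away. */
static void free_secret(void *key, size_t len)
{
	if (key) {
		explicit_bzero(key, len);
		free(key);
	}
}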
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index 96ef4d860bc9..13bd526349fa 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -5,6 +5,9 @@
#include <linux/ceph/types.h>
#include <linux/ceph/buffer.h>
+#define CEPH_KEY_LEN 16
+#define CEPH_MAX_CON_SECRET_LEN 64
+
/*
* cryptographic secret
*/
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
index eea529595a7a..b44f7651be04 100644
--- a/net/ceph/decode.c
+++ b/net/ceph/decode.c
@@ -1,4 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/inet.h>
#include <linux/ceph/decode.h>
@@ -82,3 +85,101 @@ bad:
}
EXPORT_SYMBOL(ceph_decode_entity_addr);
+/*
+ * Return addr of desired type (MSGR2 or LEGACY) or error.
+ * Make sure there is only one match.
+ *
+ * Assume encoding with MSG_ADDR2.
+ */
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr)
+{
+ __le32 my_type = msgr2 ? CEPH_ENTITY_ADDR_TYPE_MSGR2 :
+ CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ struct ceph_entity_addr tmp_addr;
+ int addr_cnt;
+ bool found;
+ u8 marker;
+ int ret;
+ int i;
+
+ ceph_decode_8_safe(p, end, marker, e_inval);
+ if (marker != 2) {
+ pr_err("bad addrvec marker %d\n", marker);
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(p, end, addr_cnt, e_inval);
+
+ found = false;
+ for (i = 0; i < addr_cnt; i++) {
+ ret = ceph_decode_entity_addr(p, end, &tmp_addr);
+ if (ret)
+ return ret;
+
+ if (tmp_addr.type == my_type) {
+ if (found) {
+ pr_err("another match of type %d in addrvec\n",
+ le32_to_cpu(my_type));
+ return -EINVAL;
+ }
+
+ memcpy(addr, &tmp_addr, sizeof(*addr));
+ found = true;
+ }
+ }
+ if (!found && addr_cnt != 0) {
+ pr_err("no match of type %d in addrvec\n",
+ le32_to_cpu(my_type));
+ return -ENOENT;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addrvec);
+
+static int get_sockaddr_encoding_len(sa_family_t family)
+{
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } u;
+
+ switch (family) {
+ case AF_INET:
+ return sizeof(u.sin);
+ case AF_INET6:
+ return sizeof(u.sin6);
+ default:
+ return sizeof(u);
+ }
+}
+
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len;
+}
+
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ ceph_encode_8(p, 1); /* marker */
+ ceph_start_encoding(p, 1, 1, sizeof(addr->type) +
+ sizeof(addr->nonce) +
+ sizeof(u32) + addr_len);
+ ceph_encode_copy(p, &addr->type, sizeof(addr->type));
+ ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce));
+
+ ceph_encode_32(p, addr_len);
+ ceph_encode_16(p, family);
+ ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family));
+}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d4d7a0e52491..57d043b382ed 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -82,71 +82,51 @@
#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */
#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */
-/*
- * connection states
- */
-#define CON_STATE_CLOSED 1 /* -> PREOPEN */
-#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */
-#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */
-#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */
-#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */
-#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */
-
-/*
- * ceph_connection flag bits
- */
-#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop
- * messages on errors */
-#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
-#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */
-#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
-#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
-
static bool con_flag_valid(unsigned long con_flag)
{
switch (con_flag) {
- case CON_FLAG_LOSSYTX:
- case CON_FLAG_KEEPALIVE_PENDING:
- case CON_FLAG_WRITE_PENDING:
- case CON_FLAG_SOCK_CLOSED:
- case CON_FLAG_BACKOFF:
+ case CEPH_CON_F_LOSSYTX:
+ case CEPH_CON_F_KEEPALIVE_PENDING:
+ case CEPH_CON_F_WRITE_PENDING:
+ case CEPH_CON_F_SOCK_CLOSED:
+ case CEPH_CON_F_BACKOFF:
return true;
default:
return false;
}
}
-static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
+void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
clear_bit(con_flag, &con->flags);
}
-static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
+void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
set_bit(con_flag, &con->flags);
}
-static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
+bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
return test_bit(con_flag, &con->flags);
}
-static bool con_flag_test_and_clear(struct ceph_connection *con,
- unsigned long con_flag)
+bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
+ unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
return test_and_clear_bit(con_flag, &con->flags);
}
-static bool con_flag_test_and_set(struct ceph_connection *con,
- unsigned long con_flag)
+bool ceph_con_flag_test_and_set(struct ceph_connection *con,
+ unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
@@ -157,12 +137,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con,
static struct kmem_cache *ceph_msg_cache;
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG;
-static char tag_ack = CEPH_MSGR_TAG_ACK;
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
-static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
-
#ifdef CONFIG_LOCKDEP
static struct lock_class_key socket_class;
#endif
@@ -184,7 +158,7 @@ static void con_fault(struct ceph_connection *con);
static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
static atomic_t addr_str_seq = ATOMIC_INIT(0);
-static struct page *zero_page; /* used in certain error cases */
+struct page *ceph_zero_page; /* used in certain error cases */
const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
{
@@ -219,10 +193,13 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
}
EXPORT_SYMBOL(ceph_pr_addr);
-static void encode_my_addr(struct ceph_messenger *msgr)
+void ceph_encode_my_addr(struct ceph_messenger *msgr)
{
- memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
- ceph_encode_banner_addr(&msgr->my_enc_addr);
+ if (!ceph_msgr2(from_msgr(msgr))) {
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr,
+ sizeof(msgr->my_enc_addr));
+ ceph_encode_banner_addr(&msgr->my_enc_addr);
+ }
}
/*
@@ -254,9 +231,9 @@ static void _ceph_msgr_exit(void)
ceph_msgr_wq = NULL;
}
- BUG_ON(zero_page == NULL);
- put_page(zero_page);
- zero_page = NULL;
+ BUG_ON(!ceph_zero_page);
+ put_page(ceph_zero_page);
+ ceph_zero_page = NULL;
ceph_msgr_slab_exit();
}
@@ -266,9 +243,9 @@ int __init ceph_msgr_init(void)
if (ceph_msgr_slab_init())
return -ENOMEM;
- BUG_ON(zero_page != NULL);
- zero_page = ZERO_PAGE(0);
- get_page(zero_page);
+ BUG_ON(ceph_zero_page);
+ ceph_zero_page = ZERO_PAGE(0);
+ get_page(ceph_zero_page);
/*
* The number of active work items is limited by the number of
@@ -372,7 +349,7 @@ static void ceph_sock_data_ready(struct sock *sk)
}
if (sk->sk_state != TCP_CLOSE_WAIT) {
- dout("%s on %p state = %lu, queueing work\n", __func__,
+ dout("%s %p state = %d, queueing work\n", __func__,
con, con->state);
queue_con(con);
}
@@ -390,7 +367,7 @@ static void ceph_sock_write_space(struct sock *sk)
* buffer. See net/ipv4/tcp_input.c:tcp_check_space()
* and net/core/stream.c:sk_stream_write_space().
*/
- if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
+ if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {
if (sk_stream_is_writeable(sk)) {
dout("%s %p queueing write work\n", __func__, con);
clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -406,7 +383,7 @@ static void ceph_sock_state_change(struct sock *sk)
{
struct ceph_connection *con = sk->sk_user_data;
- dout("%s %p state = %lu sk_state = %u\n", __func__,
+ dout("%s %p state = %d sk_state = %u\n", __func__,
con, con->state, sk->sk_state);
switch (sk->sk_state) {
@@ -416,7 +393,7 @@ static void ceph_sock_state_change(struct sock *sk)
case TCP_CLOSE_WAIT:
dout("%s TCP_CLOSE_WAIT\n", __func__);
con_sock_state_closing(con);
- con_flag_set(con, CON_FLAG_SOCK_CLOSED);
+ ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED);
queue_con(con);
break;
case TCP_ESTABLISHED:
@@ -450,13 +427,15 @@ static void set_sock_callbacks(struct socket *sock,
/*
* initiate connection to a remote socket.
*/
-static int ceph_tcp_connect(struct ceph_connection *con)
+int ceph_tcp_connect(struct ceph_connection *con)
{
struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */
struct socket *sock;
unsigned int noio_flag;
int ret;
+ dout("%s con %p peer_addr %s\n", __func__, con,
+ ceph_pr_addr(&con->peer_addr));
BUG_ON(con->sock);
/* sock_create_kern() allocates with GFP_KERNEL */
@@ -474,8 +453,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
set_sock_callbacks(sock, con);
- dout("connect %s\n", ceph_pr_addr(&con->peer_addr));
-
con_sock_state_connecting(con);
ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss),
O_NONBLOCK);
@@ -498,103 +475,13 @@ static int ceph_tcp_connect(struct ceph_connection *con)
}
/*
- * If @buf is NULL, discard up to @len bytes.
- */
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
- struct kvec iov = {buf, len};
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- int r;
-
- if (!buf)
- msg.msg_flags |= MSG_TRUNC;
-
- iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len);
- r = sock_recvmsg(sock, &msg, msg.msg_flags);
- if (r == -EAGAIN)
- r = 0;
- return r;
-}
-
-static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
- int page_offset, size_t length)
-{
- struct bio_vec bvec = {
- .bv_page = page,
- .bv_offset = page_offset,
- .bv_len = length
- };
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- int r;
-
- BUG_ON(page_offset + length > PAGE_SIZE);
- iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length);
- r = sock_recvmsg(sock, &msg, msg.msg_flags);
- if (r == -EAGAIN)
- r = 0;
- return r;
-}
-
-/*
- * write something. @more is true if caller will be sending more data
- * shortly.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
- size_t kvlen, size_t len, bool more)
-{
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- int r;
-
- if (more)
- msg.msg_flags |= MSG_MORE;
- else
- msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
-
- r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
- if (r == -EAGAIN)
- r = 0;
- return r;
-}
-
-/*
- * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
- */
-static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, int more)
-{
- ssize_t (*sendpage)(struct socket *sock, struct page *page,
- int offset, size_t size, int flags);
- int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
- int ret;
-
- /*
- * sendpage cannot properly handle pages with page_count == 0,
- * we need to fall back to sendmsg if that's the case.
- *
- * Same goes for slab pages: skb_can_coalesce() allows
- * coalescing neighboring slab objects into a single frag which
- * triggers one of hardened usercopy checks.
- */
- if (sendpage_ok(page))
- sendpage = sock->ops->sendpage;
- else
- sendpage = sock_no_sendpage;
-
- ret = sendpage(sock, page, offset, size, flags);
- if (ret == -EAGAIN)
- ret = 0;
-
- return ret;
-}
-
-/*
* Shutdown/close the socket for the given connection.
*/
-static int con_close_socket(struct ceph_connection *con)
+int ceph_con_close_socket(struct ceph_connection *con)
{
int rc = 0;
- dout("con_close_socket on %p sock %p\n", con, con->sock);
+ dout("%s con %p sock %p\n", __func__, con, con->sock);
if (con->sock) {
rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
sock_release(con->sock);
@@ -607,12 +494,34 @@ static int con_close_socket(struct ceph_connection *con)
* received a socket close event before we had the chance to
* shut the socket down.
*/
- con_flag_clear(con, CON_FLAG_SOCK_CLOSED);
+ ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED);
con_sock_state_closed(con);
return rc;
}
+static void ceph_con_reset_protocol(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+
+ ceph_con_close_socket(con);
+ if (con->in_msg) {
+ WARN_ON(con->in_msg->con != con);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+ if (con->out_msg) {
+ WARN_ON(con->out_msg->con != con);
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_protocol(con);
+ else
+ ceph_con_v1_reset_protocol(con);
+}
+
/*
* Reset a connection. Discard all incoming and outgoing messages
* and clear *_seq state.
@@ -623,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg)
ceph_msg_put(msg);
}
+
static void ceph_msg_remove_list(struct list_head *head)
{
while (!list_empty(head)) {
@@ -632,31 +542,22 @@ static void ceph_msg_remove_list(struct list_head *head)
}
}
-static void reset_connection(struct ceph_connection *con)
+void ceph_con_reset_session(struct ceph_connection *con)
{
- /* reset connection, out_queue, msg_ and connect_seq */
- /* discard existing out_queue and msg_seq */
- dout("reset_connection %p\n", con);
+ dout("%s con %p\n", __func__, con);
+
+ WARN_ON(con->in_msg);
+ WARN_ON(con->out_msg);
ceph_msg_remove_list(&con->out_queue);
ceph_msg_remove_list(&con->out_sent);
-
- if (con->in_msg) {
- BUG_ON(con->in_msg->con != con);
- ceph_msg_put(con->in_msg);
- con->in_msg = NULL;
- }
-
- con->connect_seq = 0;
con->out_seq = 0;
- if (con->out_msg) {
- BUG_ON(con->out_msg->con != con);
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL;
- }
con->in_seq = 0;
con->in_seq_acked = 0;
- con->out_skip = 0;
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_session(con);
+ else
+ ceph_con_v1_reset_session(con);
}
/*
@@ -666,17 +567,17 @@ void ceph_con_close(struct ceph_connection *con)
{
mutex_lock(&con->mutex);
dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
- con->state = CON_STATE_CLOSED;
+ con->state = CEPH_CON_S_CLOSED;
- con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */
- con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
- con_flag_clear(con, CON_FLAG_WRITE_PENDING);
- con_flag_clear(con, CON_FLAG_BACKOFF);
+ ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next
+ connect */
+ ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING);
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF);
- reset_connection(con);
- con->peer_global_seq = 0;
+ ceph_con_reset_protocol(con);
+ ceph_con_reset_session(con);
cancel_con(con);
- con_close_socket(con);
mutex_unlock(&con->mutex);
}
EXPORT_SYMBOL(ceph_con_close);
@@ -691,8 +592,8 @@ void ceph_con_open(struct ceph_connection *con,
mutex_lock(&con->mutex);
dout("con_open %p %s\n", con, ceph_pr_addr(addr));
- WARN_ON(con->state != CON_STATE_CLOSED);
- con->state = CON_STATE_PREOPEN;
+ WARN_ON(con->state != CEPH_CON_S_CLOSED);
+ con->state = CEPH_CON_S_PREOPEN;
con->peer_name.type = (__u8) entity_type;
con->peer_name.num = cpu_to_le64(entity_num);
@@ -709,7 +610,10 @@ EXPORT_SYMBOL(ceph_con_open);
*/
bool ceph_con_opened(struct ceph_connection *con)
{
- return con->connect_seq > 0;
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ return ceph_con_v2_opened(con);
+
+ return ceph_con_v1_opened(con);
}
/*
@@ -732,16 +636,15 @@ void ceph_con_init(struct ceph_connection *con, void *private,
INIT_LIST_HEAD(&con->out_sent);
INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
- con->state = CON_STATE_CLOSED;
+ con->state = CEPH_CON_S_CLOSED;
}
EXPORT_SYMBOL(ceph_con_init);
-
/*
* We maintain a global counter to order connection attempts. Get
* a unique seq greater than @gt.
*/
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt)
{
u32 ret;
@@ -753,48 +656,53 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
return ret;
}
-static void con_out_kvec_reset(struct ceph_connection *con)
-{
- BUG_ON(con->out_skip);
-
- con->out_kvec_left = 0;
- con->out_kvec_bytes = 0;
- con->out_kvec_cur = &con->out_kvec[0];
-}
-
-static void con_out_kvec_add(struct ceph_connection *con,
- size_t size, void *data)
+/*
+ * Discard messages that have been acked by the server.
+ */
+void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq)
{
- int index = con->out_kvec_left;
+ struct ceph_msg *msg;
+ u64 seq;
- BUG_ON(con->out_skip);
- BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
+ dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq);
+ while (!list_empty(&con->out_sent)) {
+ msg = list_first_entry(&con->out_sent, struct ceph_msg,
+ list_head);
+ WARN_ON(msg->needs_out_seq);
+ seq = le64_to_cpu(msg->hdr.seq);
+ if (seq > ack_seq)
+ break;
- con->out_kvec[index].iov_len = size;
- con->out_kvec[index].iov_base = data;
- con->out_kvec_left++;
- con->out_kvec_bytes += size;
+ dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+ msg, seq);
+ ceph_msg_remove(msg);
+ }
}
/*
- * Chop off a kvec from the end. Return residual number of bytes for
- * that kvec, i.e. how many bytes would have been written if the kvec
- * hadn't been nuked.
+ * Discard messages that have been requeued in con_fault(), up to
+ * reconnect_seq. This avoids gratuitously resending messages that
+ * the server had received and handled prior to reconnect.
*/
-static int con_out_kvec_skip(struct ceph_connection *con)
+void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq)
{
- int off = con->out_kvec_cur - con->out_kvec;
- int skip = 0;
+ struct ceph_msg *msg;
+ u64 seq;
- if (con->out_kvec_bytes > 0) {
- skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
- BUG_ON(con->out_kvec_bytes < skip);
- BUG_ON(!con->out_kvec_left);
- con->out_kvec_bytes -= skip;
- con->out_kvec_left--;
- }
+ dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq);
+ while (!list_empty(&con->out_queue)) {
+ msg = list_first_entry(&con->out_queue, struct ceph_msg,
+ list_head);
+ if (msg->needs_out_seq)
+ break;
+ seq = le64_to_cpu(msg->hdr.seq);
+ if (seq > reconnect_seq)
+ break;
- return skip;
+ dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+ msg, seq);
+ ceph_msg_remove(msg);
+ }
}
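Both discard helpers above walk a sequence-ordered message list and drop everything up to a cutoff seq; ceph_con_discard_requeued() additionally stops at the first message that still needs an out_seq assigned. A minimal userspace sketch of the same walk over a singly linked list; the types and names are illustrative:

#include <stdint.h>
#include <stdlib.h>

struct msg {
	uint64_t seq;
	struct msg *next;
};

/* Prune a sequence-ordered sent list up to ack_seq, mirroring the loop
 * in ceph_con_discard_sent() above. */
static struct msg *discard_acked(struct msg *sent, uint64_t ack_seq)
{
	while (sent && sent->seq <= ack_seq) {
		struct msg *done = sent;

		sent = sent->next;
		free(done);	/* the kernel drops a msg reference instead */
	}
	return sent;
}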
#ifdef CONFIG_BLOCK
@@ -1113,10 +1021,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
cursor->need_crc = true;
}
-static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
+void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+ struct ceph_msg *msg, size_t length)
{
- struct ceph_msg_data_cursor *cursor = &msg->cursor;
-
BUG_ON(!length);
BUG_ON(length > msg->data_length);
BUG_ON(!msg->num_data_items);
@@ -1132,9 +1039,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
* data item, and supply the page offset and length of that piece.
* Indicate whether this is the last piece in this data item.
*/
-static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
- size_t *page_offset, size_t *length,
- bool *last_piece)
+struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length,
+ bool *last_piece)
{
struct page *page;
@@ -1173,8 +1080,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
* Returns true if the result moves the cursor on to the next piece
* of the data item.
*/
-static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
- size_t bytes)
+void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
{
bool new_piece;
@@ -1210,328 +1116,8 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
cursor->need_crc = new_piece;
}
-static size_t sizeof_footer(struct ceph_connection *con)
-{
- return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
- sizeof(struct ceph_msg_footer) :
- sizeof(struct ceph_msg_footer_old);
-}
-
-static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
-{
- /* Initialize data cursor */
-
- ceph_msg_data_cursor_init(msg, (size_t)data_len);
-}
-
-/*
- * Prepare footer for currently outgoing message, and finish things
- * off. Assumes out_kvec* are already valid.. we just add on to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con)
-{
- struct ceph_msg *m = con->out_msg;
-
- m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
-
- dout("prepare_write_message_footer %p\n", con);
- con_out_kvec_add(con, sizeof_footer(con), &m->footer);
- if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
- if (con->ops->sign_message)
- con->ops->sign_message(m);
- else
- m->footer.sig = 0;
- } else {
- m->old_footer.flags = m->footer.flags;
- }
- con->out_more = m->more_to_follow;
- con->out_msg_done = true;
-}
-
-/*
- * Prepare headers for the next outgoing message.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
- struct ceph_msg *m;
- u32 crc;
-
- con_out_kvec_reset(con);
- con->out_msg_done = false;
-
- /* Sneak an ack in there first? If we can get it into the same
- * TCP packet that's a good thing. */
- if (con->in_seq > con->in_seq_acked) {
- con->in_seq_acked = con->in_seq;
- con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con_out_kvec_add(con, sizeof (con->out_temp_ack),
- &con->out_temp_ack);
- }
-
- BUG_ON(list_empty(&con->out_queue));
- m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
- con->out_msg = m;
- BUG_ON(m->con != con);
-
- /* put message on sent list */
- ceph_msg_get(m);
- list_move_tail(&m->list_head, &con->out_sent);
-
- /*
- * only assign outgoing seq # if we haven't sent this message
- * yet. if it is requeued, resend with it's original seq.
- */
- if (m->needs_out_seq) {
- m->hdr.seq = cpu_to_le64(++con->out_seq);
- m->needs_out_seq = false;
-
- if (con->ops->reencode_message)
- con->ops->reencode_message(m);
- }
-
- dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
- m, con->out_seq, le16_to_cpu(m->hdr.type),
- le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
- m->data_length);
- WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
- WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
-
- /* tag + hdr + front + middle */
- con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
- con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
- con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
-
- if (m->middle)
- con_out_kvec_add(con, m->middle->vec.iov_len,
- m->middle->vec.iov_base);
-
- /* fill in hdr crc and finalize hdr */
- crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
- con->out_msg->hdr.crc = cpu_to_le32(crc);
- memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
-
- /* fill in front and middle crc, footer */
- crc = crc32c(0, m->front.iov_base, m->front.iov_len);
- con->out_msg->footer.front_crc = cpu_to_le32(crc);
- if (m->middle) {
- crc = crc32c(0, m->middle->vec.iov_base,
- m->middle->vec.iov_len);
- con->out_msg->footer.middle_crc = cpu_to_le32(crc);
- } else
- con->out_msg->footer.middle_crc = 0;
- dout("%s front_crc %u middle_crc %u\n", __func__,
- le32_to_cpu(con->out_msg->footer.front_crc),
- le32_to_cpu(con->out_msg->footer.middle_crc));
- con->out_msg->footer.flags = 0;
-
- /* is there a data payload? */
- con->out_msg->footer.data_crc = 0;
- if (m->data_length) {
- prepare_message_data(con->out_msg, m->data_length);
- con->out_more = 1; /* data + footer will follow */
- } else {
- /* no, queue up footer too and be done */
- prepare_write_message_footer(con);
- }
-
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Prepare an ack.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
- dout("prepare_write_ack %p %llu -> %llu\n", con,
- con->in_seq_acked, con->in_seq);
- con->in_seq_acked = con->in_seq;
-
- con_out_kvec_reset(con);
-
- con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
-
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con_out_kvec_add(con, sizeof (con->out_temp_ack),
- &con->out_temp_ack);
-
- con->out_more = 1; /* more will follow.. eventually.. */
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Prepare to share the seq during handshake
- */
-static void prepare_write_seq(struct ceph_connection *con)
-{
- dout("prepare_write_seq %p %llu -> %llu\n", con,
- con->in_seq_acked, con->in_seq);
- con->in_seq_acked = con->in_seq;
-
- con_out_kvec_reset(con);
-
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con_out_kvec_add(con, sizeof (con->out_temp_ack),
- &con->out_temp_ack);
-
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Prepare to write keepalive byte.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
- dout("prepare_write_keepalive %p\n", con);
- con_out_kvec_reset(con);
- if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
- struct timespec64 now;
-
- ktime_get_real_ts64(&now);
- con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
- ceph_encode_timespec64(&con->out_temp_keepalive2, &now);
- con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
- &con->out_temp_keepalive2);
- } else {
- con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
- }
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Connection negotiation.
- */
-
-static int get_connect_authorizer(struct ceph_connection *con)
-{
- struct ceph_auth_handshake *auth;
- int auth_proto;
-
- if (!con->ops->get_authorizer) {
- con->auth = NULL;
- con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
- con->out_connect.authorizer_len = 0;
- return 0;
- }
-
- auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry);
- if (IS_ERR(auth))
- return PTR_ERR(auth);
-
- con->auth = auth;
- con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
- con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len);
- return 0;
-}
-
-/*
- * We connected to a peer and are saying hello.
- */
-static void prepare_write_banner(struct ceph_connection *con)
-{
- con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
- con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
- &con->msgr->my_enc_addr);
-
- con->out_more = 0;
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-static void __prepare_write_connect(struct ceph_connection *con)
-{
- con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect);
- if (con->auth)
- con_out_kvec_add(con, con->auth->authorizer_buf_len,
- con->auth->authorizer_buf);
-
- con->out_more = 0;
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-static int prepare_write_connect(struct ceph_connection *con)
-{
- unsigned int global_seq = get_global_seq(con->msgr, 0);
- int proto;
- int ret;
-
- switch (con->peer_name.type) {
- case CEPH_ENTITY_TYPE_MON:
- proto = CEPH_MONC_PROTOCOL;
- break;
- case CEPH_ENTITY_TYPE_OSD:
- proto = CEPH_OSDC_PROTOCOL;
- break;
- case CEPH_ENTITY_TYPE_MDS:
- proto = CEPH_MDSC_PROTOCOL;
- break;
- default:
- BUG();
- }
-
- dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
- con->connect_seq, global_seq, proto);
-
- con->out_connect.features =
- cpu_to_le64(from_msgr(con->msgr)->supported_features);
- con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
- con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
- con->out_connect.global_seq = cpu_to_le32(global_seq);
- con->out_connect.protocol_version = cpu_to_le32(proto);
- con->out_connect.flags = 0;
-
- ret = get_connect_authorizer(con);
- if (ret)
- return ret;
-
- __prepare_write_connect(con);
- return 0;
-}
-
-/*
- * write as much of pending kvecs to the socket as we can.
- * 1 -> done
- * 0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
- int ret;
-
- dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
- while (con->out_kvec_bytes > 0) {
- ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
- con->out_kvec_left, con->out_kvec_bytes,
- con->out_more);
- if (ret <= 0)
- goto out;
- con->out_kvec_bytes -= ret;
- if (con->out_kvec_bytes == 0)
- break; /* done */
-
- /* account for full iov entries consumed */
- while (ret >= con->out_kvec_cur->iov_len) {
- BUG_ON(!con->out_kvec_left);
- ret -= con->out_kvec_cur->iov_len;
- con->out_kvec_cur++;
- con->out_kvec_left--;
- }
- /* and for a partially-consumed entry */
- if (ret) {
- con->out_kvec_cur->iov_len -= ret;
- con->out_kvec_cur->iov_base += ret;
- }
- }
- con->out_kvec_left = 0;
- ret = 1;
-out:
- dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
- con->out_kvec_bytes, con->out_kvec_left, ret);
- return ret; /* done! */
-}
-
-static u32 ceph_crc32c_page(u32 crc, struct page *page,
- unsigned int page_offset,
- unsigned int length)
+u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
+ unsigned int length)
{
char *kaddr;
@@ -1542,257 +1128,8 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page,
return crc;
}
-/*
- * Write as much message data payload as we can. If we finish, queue
- * up the footer.
- * 1 -> done, footer is now queued in out_kvec[].
- * 0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_message_data(struct ceph_connection *con)
-{
- struct ceph_msg *msg = con->out_msg;
- struct ceph_msg_data_cursor *cursor = &msg->cursor;
- bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
- u32 crc;
-
- dout("%s %p msg %p\n", __func__, con, msg);
-
- if (!msg->num_data_items)
- return -EINVAL;
-
- /*
- * Iterate through each page that contains data to be
- * written, and send as much as possible for each.
- *
- * If we are calculating the data crc (the default), we will
- * need to map the page. If we have no pages, they have
- * been revoked, so use the zero page.
- */
- crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
- while (cursor->total_resid) {
- struct page *page;
- size_t page_offset;
- size_t length;
- int ret;
-
- if (!cursor->resid) {
- ceph_msg_data_advance(cursor, 0);
- continue;
- }
-
- page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
- if (length == cursor->total_resid)
- more = MSG_MORE;
- ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
- more);
- if (ret <= 0) {
- if (do_datacrc)
- msg->footer.data_crc = cpu_to_le32(crc);
-
- return ret;
- }
- if (do_datacrc && cursor->need_crc)
- crc = ceph_crc32c_page(crc, page, page_offset, length);
- ceph_msg_data_advance(cursor, (size_t)ret);
- }
-
- dout("%s %p msg %p done\n", __func__, con, msg);
-
- /* prepare and queue up footer, too */
- if (do_datacrc)
- msg->footer.data_crc = cpu_to_le32(crc);
- else
- msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
- con_out_kvec_reset(con);
- prepare_write_message_footer(con);
-
- return 1; /* must return > 0 to indicate success */
-}
-
-/*
- * write some zeros
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
- int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
- int ret;
-
- dout("%s %p %d left\n", __func__, con, con->out_skip);
- while (con->out_skip > 0) {
- size_t size = min(con->out_skip, (int) PAGE_SIZE);
-
- if (size == con->out_skip)
- more = MSG_MORE;
- ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more);
- if (ret <= 0)
- goto out;
- con->out_skip -= ret;
- }
- ret = 1;
-out:
- return ret;
-}
-
-/*
- * Prepare to read connection handshake, or an ack.
- */
-static void prepare_read_banner(struct ceph_connection *con)
-{
- dout("prepare_read_banner %p\n", con);
- con->in_base_pos = 0;
-}
-
-static void prepare_read_connect(struct ceph_connection *con)
-{
- dout("prepare_read_connect %p\n", con);
- con->in_base_pos = 0;
-}
-
-static void prepare_read_ack(struct ceph_connection *con)
-{
- dout("prepare_read_ack %p\n", con);
- con->in_base_pos = 0;
-}
-
-static void prepare_read_seq(struct ceph_connection *con)
-{
- dout("prepare_read_seq %p\n", con);
- con->in_base_pos = 0;
- con->in_tag = CEPH_MSGR_TAG_SEQ;
-}
-
-static void prepare_read_tag(struct ceph_connection *con)
-{
- dout("prepare_read_tag %p\n", con);
- con->in_base_pos = 0;
- con->in_tag = CEPH_MSGR_TAG_READY;
-}
-
-static void prepare_read_keepalive_ack(struct ceph_connection *con)
-{
- dout("prepare_read_keepalive_ack %p\n", con);
- con->in_base_pos = 0;
-}
-
-/*
- * Prepare to read a message.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
- dout("prepare_read_message %p\n", con);
- BUG_ON(con->in_msg != NULL);
- con->in_base_pos = 0;
- con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
- return 0;
-}
-
-
-static int read_partial(struct ceph_connection *con,
- int end, int size, void *object)
-{
- while (con->in_base_pos < end) {
- int left = end - con->in_base_pos;
- int have = size - left;
- int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
- if (ret <= 0)
- return ret;
- con->in_base_pos += ret;
- }
- return 1;
-}
-
-
-/*
- * Read all or part of the connect-side handshake on a new connection
- */
-static int read_partial_banner(struct ceph_connection *con)
-{
- int size;
- int end;
- int ret;
-
- dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
-
- /* peer's banner */
- size = strlen(CEPH_BANNER);
- end = size;
- ret = read_partial(con, end, size, con->in_banner);
- if (ret <= 0)
- goto out;
-
- size = sizeof (con->actual_peer_addr);
- end += size;
- ret = read_partial(con, end, size, &con->actual_peer_addr);
- if (ret <= 0)
- goto out;
- ceph_decode_banner_addr(&con->actual_peer_addr);
-
- size = sizeof (con->peer_addr_for_me);
- end += size;
- ret = read_partial(con, end, size, &con->peer_addr_for_me);
- if (ret <= 0)
- goto out;
- ceph_decode_banner_addr(&con->peer_addr_for_me);
-
-out:
- return ret;
-}
-
-static int read_partial_connect(struct ceph_connection *con)
-{
- int size;
- int end;
- int ret;
-
- dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-
- size = sizeof (con->in_reply);
- end = size;
- ret = read_partial(con, end, size, &con->in_reply);
- if (ret <= 0)
- goto out;
-
- if (con->auth) {
- size = le32_to_cpu(con->in_reply.authorizer_len);
- if (size > con->auth->authorizer_reply_buf_len) {
- pr_err("authorizer reply too big: %d > %zu\n", size,
- con->auth->authorizer_reply_buf_len);
- ret = -EINVAL;
- goto out;
- }
-
- end += size;
- ret = read_partial(con, end, size,
- con->auth->authorizer_reply_buf);
- if (ret <= 0)
- goto out;
- }
-
- dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
- con, (int)con->in_reply.tag,
- le32_to_cpu(con->in_reply.connect_seq),
- le32_to_cpu(con->in_reply.global_seq));
-out:
- return ret;
-}
-/*
- * Verify the hello banner looks okay.
- */
-static int verify_hello(struct ceph_connection *con)
-{
- if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
- pr_err("connect to %s got bad banner\n",
- ceph_pr_addr(&con->peer_addr));
- con->error_msg = "protocol error, bad banner";
- return -1;
- }
- return 0;
-}
-
-static bool addr_is_blank(struct ceph_entity_addr *addr)
+bool ceph_addr_is_blank(const struct ceph_entity_addr *addr)
{
struct sockaddr_storage ss = addr->in_addr; /* align */
struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr;
@@ -1808,7 +1145,7 @@ static bool addr_is_blank(struct ceph_entity_addr *addr)
}
}
-static int addr_port(struct ceph_entity_addr *addr)
+int ceph_addr_port(const struct ceph_entity_addr *addr)
{
switch (get_unaligned(&addr->in_addr.ss_family)) {
case AF_INET:
@@ -1819,7 +1156,7 @@ static int addr_port(struct ceph_entity_addr *addr)
return 0;
}
-static void addr_set_port(struct ceph_entity_addr *addr, int p)
+void ceph_addr_set_port(struct ceph_entity_addr *addr, int p)
{
switch (get_unaligned(&addr->in_addr.ss_family)) {
case AF_INET:
@@ -1977,8 +1314,17 @@ int ceph_parse_ips(const char *c, const char *end,
port = CEPH_MON_PORT;
}
- addr_set_port(&addr[i], port);
+ ceph_addr_set_port(&addr[i], port);
+ /*
+ * We want the type to be set according to ms_mode
+ * option, but options are normally parsed after mon
+ * addresses. Rather than complicating parsing, set
+ * to LEGACY and override in build_initial_monmap()
+ * for mon addresses and ceph_messenger_init() for
+ * ip option.
+ */
addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ addr[i].nonce = 0;
dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
@@ -2000,521 +1346,12 @@ bad:
return ret;
}
-static int process_banner(struct ceph_connection *con)
-{
- dout("process_banner on %p\n", con);
-
- if (verify_hello(con) < 0)
- return -1;
-
- /*
- * Make sure the other end is who we wanted. note that the other
- * end may not yet know their ip address, so if it's 0.0.0.0, give
- * them the benefit of the doubt.
- */
- if (memcmp(&con->peer_addr, &con->actual_peer_addr,
- sizeof(con->peer_addr)) != 0 &&
- !(addr_is_blank(&con->actual_peer_addr) &&
- con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
- pr_warn("wrong peer, want %s/%d, got %s/%d\n",
- ceph_pr_addr(&con->peer_addr),
- (int)le32_to_cpu(con->peer_addr.nonce),
- ceph_pr_addr(&con->actual_peer_addr),
- (int)le32_to_cpu(con->actual_peer_addr.nonce));
- con->error_msg = "wrong peer at address";
- return -1;
- }
-
- /*
- * did we learn our address?
- */
- if (addr_is_blank(&con->msgr->inst.addr)) {
- int port = addr_port(&con->msgr->inst.addr);
-
- memcpy(&con->msgr->inst.addr.in_addr,
- &con->peer_addr_for_me.in_addr,
- sizeof(con->peer_addr_for_me.in_addr));
- addr_set_port(&con->msgr->inst.addr, port);
- encode_my_addr(con->msgr);
- dout("process_banner learned my addr is %s\n",
- ceph_pr_addr(&con->msgr->inst.addr));
- }
-
- return 0;
-}
-
-static int process_connect(struct ceph_connection *con)
-{
- u64 sup_feat = from_msgr(con->msgr)->supported_features;
- u64 req_feat = from_msgr(con->msgr)->required_features;
- u64 server_feat = le64_to_cpu(con->in_reply.features);
- int ret;
-
- dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-
- if (con->auth) {
- int len = le32_to_cpu(con->in_reply.authorizer_len);
-
- /*
- * Any connection that defines ->get_authorizer()
- * should also define ->add_authorizer_challenge() and
- * ->verify_authorizer_reply().
- *
- * See get_connect_authorizer().
- */
- if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
- ret = con->ops->add_authorizer_challenge(
- con, con->auth->authorizer_reply_buf, len);
- if (ret < 0)
- return ret;
-
- con_out_kvec_reset(con);
- __prepare_write_connect(con);
- prepare_read_connect(con);
- return 0;
- }
-
- if (len) {
- ret = con->ops->verify_authorizer_reply(con);
- if (ret < 0) {
- con->error_msg = "bad authorize reply";
- return ret;
- }
- }
- }
-
- switch (con->in_reply.tag) {
- case CEPH_MSGR_TAG_FEATURES:
- pr_err("%s%lld %s feature set mismatch,"
- " my %llx < server's %llx, missing %llx\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr),
- sup_feat, server_feat, server_feat & ~sup_feat);
- con->error_msg = "missing required protocol features";
- reset_connection(con);
- return -1;
-
- case CEPH_MSGR_TAG_BADPROTOVER:
- pr_err("%s%lld %s protocol version mismatch,"
- " my %d != server's %d\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr),
- le32_to_cpu(con->out_connect.protocol_version),
- le32_to_cpu(con->in_reply.protocol_version));
- con->error_msg = "protocol version mismatch";
- reset_connection(con);
- return -1;
-
- case CEPH_MSGR_TAG_BADAUTHORIZER:
- con->auth_retry++;
- dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
- con->auth_retry);
- if (con->auth_retry == 2) {
- con->error_msg = "connect authorization failure";
- return -1;
- }
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
- break;
-
- case CEPH_MSGR_TAG_RESETSESSION:
- /*
- * If we connected with a large connect_seq but the peer
- * has no record of a session with us (no connection, or
- * connect_seq == 0), they will send RESETSESION to indicate
- * that they must have reset their session, and may have
- * dropped messages.
- */
- dout("process_connect got RESET peer seq %u\n",
- le32_to_cpu(con->in_reply.connect_seq));
- pr_err("%s%lld %s connection reset\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr));
- reset_connection(con);
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
-
- /* Tell ceph about it. */
- mutex_unlock(&con->mutex);
- pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
- if (con->ops->peer_reset)
- con->ops->peer_reset(con);
- mutex_lock(&con->mutex);
- if (con->state != CON_STATE_NEGOTIATING)
- return -EAGAIN;
- break;
-
- case CEPH_MSGR_TAG_RETRY_SESSION:
- /*
- * If we sent a smaller connect_seq than the peer has, try
- * again with a larger value.
- */
- dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
- le32_to_cpu(con->out_connect.connect_seq),
- le32_to_cpu(con->in_reply.connect_seq));
- con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
- break;
-
- case CEPH_MSGR_TAG_RETRY_GLOBAL:
- /*
- * If we sent a smaller global_seq than the peer has, try
- * again with a larger value.
- */
- dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
- con->peer_global_seq,
- le32_to_cpu(con->in_reply.global_seq));
- get_global_seq(con->msgr,
- le32_to_cpu(con->in_reply.global_seq));
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
- break;
-
- case CEPH_MSGR_TAG_SEQ:
- case CEPH_MSGR_TAG_READY:
- if (req_feat & ~server_feat) {
- pr_err("%s%lld %s protocol feature mismatch,"
- " my required %llx > server's %llx, need %llx\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr),
- req_feat, server_feat, req_feat & ~server_feat);
- con->error_msg = "missing required protocol features";
- reset_connection(con);
- return -1;
- }
-
- WARN_ON(con->state != CON_STATE_NEGOTIATING);
- con->state = CON_STATE_OPEN;
- con->auth_retry = 0; /* we authenticated; clear flag */
- con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
- con->connect_seq++;
- con->peer_features = server_feat;
- dout("process_connect got READY gseq %d cseq %d (%d)\n",
- con->peer_global_seq,
- le32_to_cpu(con->in_reply.connect_seq),
- con->connect_seq);
- WARN_ON(con->connect_seq !=
- le32_to_cpu(con->in_reply.connect_seq));
-
- if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
- con_flag_set(con, CON_FLAG_LOSSYTX);
-
- con->delay = 0; /* reset backoff memory */
-
- if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
- prepare_write_seq(con);
- prepare_read_seq(con);
- } else {
- prepare_read_tag(con);
- }
- break;
-
- case CEPH_MSGR_TAG_WAIT:
- /*
- * If there is a connection race (we are opening
- * connections to each other), one of us may just have
- * to WAIT. This shouldn't happen if we are the
- * client.
- */
- con->error_msg = "protocol error, got WAIT as client";
- return -1;
-
- default:
- con->error_msg = "protocol error, garbage tag during connect";
- return -1;
- }
- return 0;
-}
-
-
-/*
- * read (part of) an ack
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
- int size = sizeof (con->in_temp_ack);
- int end = size;
-
- return read_partial(con, end, size, &con->in_temp_ack);
-}
-
-/*
- * We can finally discard anything that's been acked.
- */
-static void process_ack(struct ceph_connection *con)
-{
- struct ceph_msg *m;
- u64 ack = le64_to_cpu(con->in_temp_ack);
- u64 seq;
- bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ);
- struct list_head *list = reconnect ? &con->out_queue : &con->out_sent;
-
- /*
- * In the reconnect case, con_fault() has requeued messages
- * in out_sent. We should cleanup old messages according to
- * the reconnect seq.
- */
- while (!list_empty(list)) {
- m = list_first_entry(list, struct ceph_msg, list_head);
- if (reconnect && m->needs_out_seq)
- break;
- seq = le64_to_cpu(m->hdr.seq);
- if (seq > ack)
- break;
- dout("got ack for seq %llu type %d at %p\n", seq,
- le16_to_cpu(m->hdr.type), m);
- m->ack_stamp = jiffies;
- ceph_msg_remove(m);
- }
-
- prepare_read_tag(con);
-}
-
-
-static int read_partial_message_section(struct ceph_connection *con,
- struct kvec *section,
- unsigned int sec_len, u32 *crc)
-{
- int ret, left;
-
- BUG_ON(!section);
-
- while (section->iov_len < sec_len) {
- BUG_ON(section->iov_base == NULL);
- left = sec_len - section->iov_len;
- ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
- section->iov_len, left);
- if (ret <= 0)
- return ret;
- section->iov_len += ret;
- }
- if (section->iov_len == sec_len)
- *crc = crc32c(0, section->iov_base, section->iov_len);
-
- return 1;
-}
-
-static int read_partial_msg_data(struct ceph_connection *con)
-{
- struct ceph_msg *msg = con->in_msg;
- struct ceph_msg_data_cursor *cursor = &msg->cursor;
- bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- struct page *page;
- size_t page_offset;
- size_t length;
- u32 crc = 0;
- int ret;
-
- if (!msg->num_data_items)
- return -EIO;
-
- if (do_datacrc)
- crc = con->in_data_crc;
- while (cursor->total_resid) {
- if (!cursor->resid) {
- ceph_msg_data_advance(cursor, 0);
- continue;
- }
-
- page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
- ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
- if (ret <= 0) {
- if (do_datacrc)
- con->in_data_crc = crc;
-
- return ret;
- }
-
- if (do_datacrc)
- crc = ceph_crc32c_page(crc, page, page_offset, ret);
- ceph_msg_data_advance(cursor, (size_t)ret);
- }
- if (do_datacrc)
- con->in_data_crc = crc;
-
- return 1; /* must return > 0 to indicate success */
-}
-
-/*
- * read (part of) a message.
- */
-static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
-
-static int read_partial_message(struct ceph_connection *con)
-{
- struct ceph_msg *m = con->in_msg;
- int size;
- int end;
- int ret;
- unsigned int front_len, middle_len, data_len;
- bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
- u64 seq;
- u32 crc;
-
- dout("read_partial_message con %p msg %p\n", con, m);
-
- /* header */
- size = sizeof (con->in_hdr);
- end = size;
- ret = read_partial(con, end, size, &con->in_hdr);
- if (ret <= 0)
- return ret;
-
- crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
- if (cpu_to_le32(crc) != con->in_hdr.crc) {
- pr_err("read_partial_message bad hdr crc %u != expected %u\n",
- crc, con->in_hdr.crc);
- return -EBADMSG;
- }
-
- front_len = le32_to_cpu(con->in_hdr.front_len);
- if (front_len > CEPH_MSG_MAX_FRONT_LEN)
- return -EIO;
- middle_len = le32_to_cpu(con->in_hdr.middle_len);
- if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
- return -EIO;
- data_len = le32_to_cpu(con->in_hdr.data_len);
- if (data_len > CEPH_MSG_MAX_DATA_LEN)
- return -EIO;
-
- /* verify seq# */
- seq = le64_to_cpu(con->in_hdr.seq);
- if ((s64)seq - (s64)con->in_seq < 1) {
- pr_info("skipping %s%lld %s seq %lld expected %lld\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr),
- seq, con->in_seq + 1);
- con->in_base_pos = -front_len - middle_len - data_len -
- sizeof_footer(con);
- con->in_tag = CEPH_MSGR_TAG_READY;
- return 1;
- } else if ((s64)seq - (s64)con->in_seq > 1) {
- pr_err("read_partial_message bad seq %lld expected %lld\n",
- seq, con->in_seq + 1);
- con->error_msg = "bad message sequence # for incoming message";
- return -EBADE;
- }
-
- /* allocate message? */
- if (!con->in_msg) {
- int skip = 0;
-
- dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
- front_len, data_len);
- ret = ceph_con_in_msg_alloc(con, &skip);
- if (ret < 0)
- return ret;
-
- BUG_ON(!con->in_msg ^ skip);
- if (skip) {
- /* skip this message */
- dout("alloc_msg said skip message\n");
- con->in_base_pos = -front_len - middle_len - data_len -
- sizeof_footer(con);
- con->in_tag = CEPH_MSGR_TAG_READY;
- con->in_seq++;
- return 1;
- }
-
- BUG_ON(!con->in_msg);
- BUG_ON(con->in_msg->con != con);
- m = con->in_msg;
- m->front.iov_len = 0; /* haven't read it yet */
- if (m->middle)
- m->middle->vec.iov_len = 0;
-
- /* prepare for data payload, if any */
-
- if (data_len)
- prepare_message_data(con->in_msg, data_len);
- }
-
- /* front */
- ret = read_partial_message_section(con, &m->front, front_len,
- &con->in_front_crc);
- if (ret <= 0)
- return ret;
-
- /* middle */
- if (m->middle) {
- ret = read_partial_message_section(con, &m->middle->vec,
- middle_len,
- &con->in_middle_crc);
- if (ret <= 0)
- return ret;
- }
-
- /* (page) data */
- if (data_len) {
- ret = read_partial_msg_data(con);
- if (ret <= 0)
- return ret;
- }
-
- /* footer */
- size = sizeof_footer(con);
- end += size;
- ret = read_partial(con, end, size, &m->footer);
- if (ret <= 0)
- return ret;
-
- if (!need_sign) {
- m->footer.flags = m->old_footer.flags;
- m->footer.sig = 0;
- }
-
- dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
- m, front_len, m->footer.front_crc, middle_len,
- m->footer.middle_crc, data_len, m->footer.data_crc);
-
- /* crc ok? */
- if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
- pr_err("read_partial_message %p front crc %u != exp. %u\n",
- m, con->in_front_crc, m->footer.front_crc);
- return -EBADMSG;
- }
- if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
- pr_err("read_partial_message %p middle crc %u != exp %u\n",
- m, con->in_middle_crc, m->footer.middle_crc);
- return -EBADMSG;
- }
- if (do_datacrc &&
- (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
- con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
- pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
- con->in_data_crc, le32_to_cpu(m->footer.data_crc));
- return -EBADMSG;
- }
-
- if (need_sign && con->ops->check_message_signature &&
- con->ops->check_message_signature(m)) {
- pr_err("read_partial_message %p signature check failed\n", m);
- return -EBADMSG;
- }
-
- return 1; /* done! */
-}
-
/*
* Process message. This happens in the worker thread. The callback should
* be careful not to do anything that waits on other incoming messages or it
* may deadlock.
*/
-static void process_message(struct ceph_connection *con)
+void ceph_con_process_message(struct ceph_connection *con)
{
struct ceph_msg *msg = con->in_msg;
@@ -2528,12 +1365,13 @@ static void process_message(struct ceph_connection *con)
con->in_seq++;
mutex_unlock(&con->mutex);
- dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+ dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n",
msg, le64_to_cpu(msg->hdr.seq),
ENTITY_NAME(msg->hdr.src),
le16_to_cpu(msg->hdr.type),
ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
le32_to_cpu(msg->hdr.data_len),
con->in_front_crc, con->in_middle_crc, con->in_data_crc);
con->ops->dispatch(con, msg);
@@ -2541,264 +1379,6 @@ static void process_message(struct ceph_connection *con)
mutex_lock(&con->mutex);
}
-static int read_keepalive_ack(struct ceph_connection *con)
-{
- struct ceph_timespec ceph_ts;
- size_t size = sizeof(ceph_ts);
- int ret = read_partial(con, size, size, &ceph_ts);
- if (ret <= 0)
- return ret;
- ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
- prepare_read_tag(con);
- return 1;
-}
-
-/*
- * Write something to the socket. Called in a worker thread when the
- * socket appears to be writeable and we have something ready to send.
- */
-static int try_write(struct ceph_connection *con)
-{
- int ret = 1;
-
- dout("try_write start %p state %lu\n", con, con->state);
- if (con->state != CON_STATE_PREOPEN &&
- con->state != CON_STATE_CONNECTING &&
- con->state != CON_STATE_NEGOTIATING &&
- con->state != CON_STATE_OPEN)
- return 0;
-
- /* open the socket first? */
- if (con->state == CON_STATE_PREOPEN) {
- BUG_ON(con->sock);
- con->state = CON_STATE_CONNECTING;
-
- con_out_kvec_reset(con);
- prepare_write_banner(con);
- prepare_read_banner(con);
-
- BUG_ON(con->in_msg);
- con->in_tag = CEPH_MSGR_TAG_READY;
- dout("try_write initiating connect on %p new state %lu\n",
- con, con->state);
- ret = ceph_tcp_connect(con);
- if (ret < 0) {
- con->error_msg = "connect error";
- goto out;
- }
- }
-
-more:
- dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
- BUG_ON(!con->sock);
-
- /* kvec data queued? */
- if (con->out_kvec_left) {
- ret = write_partial_kvec(con);
- if (ret <= 0)
- goto out;
- }
- if (con->out_skip) {
- ret = write_partial_skip(con);
- if (ret <= 0)
- goto out;
- }
-
- /* msg pages? */
- if (con->out_msg) {
- if (con->out_msg_done) {
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL; /* we're done with this one */
- goto do_next;
- }
-
- ret = write_partial_message_data(con);
- if (ret == 1)
- goto more; /* we need to send the footer, too! */
- if (ret == 0)
- goto out;
- if (ret < 0) {
- dout("try_write write_partial_message_data err %d\n",
- ret);
- goto out;
- }
- }
-
-do_next:
- if (con->state == CON_STATE_OPEN) {
- if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
- prepare_write_keepalive(con);
- goto more;
- }
- /* is anything else pending? */
- if (!list_empty(&con->out_queue)) {
- prepare_write_message(con);
- goto more;
- }
- if (con->in_seq > con->in_seq_acked) {
- prepare_write_ack(con);
- goto more;
- }
- }
-
- /* Nothing to do! */
- con_flag_clear(con, CON_FLAG_WRITE_PENDING);
- dout("try_write nothing else to write.\n");
- ret = 0;
-out:
- dout("try_write done on %p ret %d\n", con, ret);
- return ret;
-}
-
-/*
- * Read what we can from the socket.
- */
-static int try_read(struct ceph_connection *con)
-{
- int ret = -1;
-
-more:
- dout("try_read start on %p state %lu\n", con, con->state);
- if (con->state != CON_STATE_CONNECTING &&
- con->state != CON_STATE_NEGOTIATING &&
- con->state != CON_STATE_OPEN)
- return 0;
-
- BUG_ON(!con->sock);
-
- dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
- con->in_base_pos);
-
- if (con->state == CON_STATE_CONNECTING) {
- dout("try_read connecting\n");
- ret = read_partial_banner(con);
- if (ret <= 0)
- goto out;
- ret = process_banner(con);
- if (ret < 0)
- goto out;
-
- con->state = CON_STATE_NEGOTIATING;
-
- /*
- * Received banner is good, exchange connection info.
- * Do not reset out_kvec, as sending our banner raced
- * with receiving peer banner after connect completed.
- */
- ret = prepare_write_connect(con);
- if (ret < 0)
- goto out;
- prepare_read_connect(con);
-
- /* Send connection info before awaiting response */
- goto out;
- }
-
- if (con->state == CON_STATE_NEGOTIATING) {
- dout("try_read negotiating\n");
- ret = read_partial_connect(con);
- if (ret <= 0)
- goto out;
- ret = process_connect(con);
- if (ret < 0)
- goto out;
- goto more;
- }
-
- WARN_ON(con->state != CON_STATE_OPEN);
-
- if (con->in_base_pos < 0) {
- /*
- * skipping + discarding content.
- */
- ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos);
- if (ret <= 0)
- goto out;
- dout("skipped %d / %d bytes\n", ret, -con->in_base_pos);
- con->in_base_pos += ret;
- if (con->in_base_pos)
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_READY) {
- /*
- * what's next?
- */
- ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
- if (ret <= 0)
- goto out;
- dout("try_read got tag %d\n", (int)con->in_tag);
- switch (con->in_tag) {
- case CEPH_MSGR_TAG_MSG:
- prepare_read_message(con);
- break;
- case CEPH_MSGR_TAG_ACK:
- prepare_read_ack(con);
- break;
- case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
- prepare_read_keepalive_ack(con);
- break;
- case CEPH_MSGR_TAG_CLOSE:
- con_close_socket(con);
- con->state = CON_STATE_CLOSED;
- goto out;
- default:
- goto bad_tag;
- }
- }
- if (con->in_tag == CEPH_MSGR_TAG_MSG) {
- ret = read_partial_message(con);
- if (ret <= 0) {
- switch (ret) {
- case -EBADMSG:
- con->error_msg = "bad crc/signature";
- fallthrough;
- case -EBADE:
- ret = -EIO;
- break;
- case -EIO:
- con->error_msg = "io error";
- break;
- }
- goto out;
- }
- if (con->in_tag == CEPH_MSGR_TAG_READY)
- goto more;
- process_message(con);
- if (con->state == CON_STATE_OPEN)
- prepare_read_tag(con);
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_ACK ||
- con->in_tag == CEPH_MSGR_TAG_SEQ) {
- /*
- * the final handshake seq exchange is semantically
- * equivalent to an ACK
- */
- ret = read_partial_ack(con);
- if (ret <= 0)
- goto out;
- process_ack(con);
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
- ret = read_keepalive_ack(con);
- if (ret <= 0)
- goto out;
- goto more;
- }
-
-out:
- dout("try_read done on %p ret %d\n", con, ret);
- return ret;
-
-bad_tag:
- pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
- con->error_msg = "protocol error, garbage tag";
- ret = -1;
- goto out;
-}
-
-
/*
* Atomically queue work on a connection after the specified delay.
* Bump @con reference to avoid races with connection teardown.
@@ -2811,13 +1391,16 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
return -ENOENT;
}
+ if (delay >= HZ)
+ delay = round_jiffies_relative(delay);
+
+ dout("%s %p %lu\n", __func__, con, delay);
if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
dout("%s %p - already queued\n", __func__, con);
con->ops->put(con);
return -EBUSY;
}
- dout("%s %p %lu\n", __func__, con, delay);
return 0;
}
@@ -2836,27 +1419,30 @@ static void cancel_con(struct ceph_connection *con)
static bool con_sock_closed(struct ceph_connection *con)
{
- if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
+ if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED))
return false;
#define CASE(x) \
- case CON_STATE_ ## x: \
+ case CEPH_CON_S_ ## x: \
con->error_msg = "socket closed (con state " #x ")"; \
break;
switch (con->state) {
CASE(CLOSED);
CASE(PREOPEN);
- CASE(CONNECTING);
- CASE(NEGOTIATING);
+ CASE(V1_BANNER);
+ CASE(V1_CONNECT_MSG);
+ CASE(V2_BANNER_PREFIX);
+ CASE(V2_BANNER_PAYLOAD);
+ CASE(V2_HELLO);
+ CASE(V2_AUTH);
+ CASE(V2_AUTH_SIGNATURE);
+ CASE(V2_SESSION_CONNECT);
+ CASE(V2_SESSION_RECONNECT);
CASE(OPEN);
CASE(STANDBY);
default:
- pr_warn("%s con %p unrecognized state %lu\n",
- __func__, con, con->state);
- con->error_msg = "unrecognized con state";
BUG();
- break;
}
#undef CASE
@@ -2867,15 +1453,15 @@ static bool con_backoff(struct ceph_connection *con)
{
int ret;
- if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
+ if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF))
return false;
- ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+ ret = queue_con_delay(con, con->delay);
if (ret) {
dout("%s: con %p FAILED to back off %lu\n", __func__,
con, con->delay);
BUG_ON(ret == -ENOENT);
- con_flag_set(con, CON_FLAG_BACKOFF);
+ ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
}
return true;
@@ -2891,11 +1477,11 @@ static void con_fault_finish(struct ceph_connection *con)
* in case we faulted due to authentication, invalidate our
* current tickets so that we can get new ones.
*/
- if (con->auth_retry) {
- dout("auth_retry %d, invalidating\n", con->auth_retry);
+ if (con->v1.auth_retry) {
+ dout("auth_retry %d, invalidating\n", con->v1.auth_retry);
if (con->ops->invalidate_authorizer)
con->ops->invalidate_authorizer(con);
- con->auth_retry = 0;
+ con->v1.auth_retry = 0;
}
if (con->ops->fault)
@@ -2923,21 +1509,24 @@ static void ceph_con_workfn(struct work_struct *work)
dout("%s: con %p BACKOFF\n", __func__, con);
break;
}
- if (con->state == CON_STATE_STANDBY) {
+ if (con->state == CEPH_CON_S_STANDBY) {
dout("%s: con %p STANDBY\n", __func__, con);
break;
}
- if (con->state == CON_STATE_CLOSED) {
+ if (con->state == CEPH_CON_S_CLOSED) {
dout("%s: con %p CLOSED\n", __func__, con);
BUG_ON(con->sock);
break;
}
- if (con->state == CON_STATE_PREOPEN) {
+ if (con->state == CEPH_CON_S_PREOPEN) {
dout("%s: con %p PREOPEN\n", __func__, con);
BUG_ON(con->sock);
}
- ret = try_read(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_read(con);
+ else
+ ret = ceph_con_v1_try_read(con);
if (ret < 0) {
if (ret == -EAGAIN)
continue;
@@ -2947,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work)
break;
}
- ret = try_write(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_write(con);
+ else
+ ret = ceph_con_v1_try_write(con);
if (ret < 0) {
if (ret == -EAGAIN)
continue;
@@ -2974,59 +1566,54 @@ static void ceph_con_workfn(struct work_struct *work)
*/
static void con_fault(struct ceph_connection *con)
{
- dout("fault %p state %lu to peer %s\n",
+ dout("fault %p state %d to peer %s\n",
con, con->state, ceph_pr_addr(&con->peer_addr));
pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
ceph_pr_addr(&con->peer_addr), con->error_msg);
con->error_msg = NULL;
- WARN_ON(con->state != CON_STATE_CONNECTING &&
- con->state != CON_STATE_NEGOTIATING &&
- con->state != CON_STATE_OPEN);
+ WARN_ON(con->state == CEPH_CON_S_STANDBY ||
+ con->state == CEPH_CON_S_CLOSED);
- con_close_socket(con);
+ ceph_con_reset_protocol(con);
- if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
+ if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) {
dout("fault on LOSSYTX channel, marking CLOSED\n");
- con->state = CON_STATE_CLOSED;
+ con->state = CEPH_CON_S_CLOSED;
return;
}
- if (con->in_msg) {
- BUG_ON(con->in_msg->con != con);
- ceph_msg_put(con->in_msg);
- con->in_msg = NULL;
- }
-
/* Requeue anything that hasn't been acked */
list_splice_init(&con->out_sent, &con->out_queue);
/* If there are no messages queued or keepalive pending, place
* the connection in a STANDBY state */
if (list_empty(&con->out_queue) &&
- !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
+ !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
- con_flag_clear(con, CON_FLAG_WRITE_PENDING);
- con->state = CON_STATE_STANDBY;
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ con->state = CEPH_CON_S_STANDBY;
} else {
/* retry after a delay. */
- con->state = CON_STATE_PREOPEN;
- if (con->delay == 0)
+ con->state = CEPH_CON_S_PREOPEN;
+ if (!con->delay) {
con->delay = BASE_DELAY_INTERVAL;
- else if (con->delay < MAX_DELAY_INTERVAL)
+ } else if (con->delay < MAX_DELAY_INTERVAL) {
con->delay *= 2;
- con_flag_set(con, CON_FLAG_BACKOFF);
+ if (con->delay > MAX_DELAY_INTERVAL)
+ con->delay = MAX_DELAY_INTERVAL;
+ }
+ ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
queue_con(con);
}
}
-
void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
{
u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
msgr->inst.addr.nonce = cpu_to_le32(nonce);
- encode_my_addr(msgr);
+ ceph_encode_my_addr(msgr);
}
/*
@@ -3037,26 +1624,35 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
{
spin_lock_init(&msgr->global_seq_lock);
- if (myaddr)
- msgr->inst.addr = *myaddr;
+ if (myaddr) {
+ memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr,
+ sizeof(msgr->inst.addr.in_addr));
+ ceph_addr_set_port(&msgr->inst.addr, 0);
+ }
- /* select a random nonce */
- msgr->inst.addr.type = 0;
- get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
- encode_my_addr(msgr);
+ /*
+ * Since nautilus, clients are identified using type ANY.
+ * For msgr1, ceph_encode_banner_addr() munges it to NONE.
+ */
+ msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY;
+
+ /* generate a random non-zero nonce */
+ do {
+ get_random_bytes(&msgr->inst.addr.nonce,
+ sizeof(msgr->inst.addr.nonce));
+ } while (!msgr->inst.addr.nonce);
+ ceph_encode_my_addr(msgr);
atomic_set(&msgr->stopping, 0);
write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
dout("%s %p\n", __func__, msgr);
}
-EXPORT_SYMBOL(ceph_messenger_init);
void ceph_messenger_fini(struct ceph_messenger *msgr)
{
put_net(read_pnet(&msgr->net));
}
-EXPORT_SYMBOL(ceph_messenger_fini);
static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
{
@@ -3070,17 +1666,19 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
static void clear_standby(struct ceph_connection *con)
{
/* come back from STANDBY? */
- if (con->state == CON_STATE_STANDBY) {
+ if (con->state == CEPH_CON_S_STANDBY) {
dout("clear_standby %p and ++connect_seq\n", con);
- con->state = CON_STATE_PREOPEN;
- con->connect_seq++;
- WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
- WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
+ con->state = CEPH_CON_S_PREOPEN;
+ con->v1.connect_seq++;
+ WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
+ WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
}
}
/*
* Queue up an outgoing message on the given connection.
+ *
+ * Consumes a ref on @msg.
*/
void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
{
@@ -3091,7 +1689,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
mutex_lock(&con->mutex);
- if (con->state == CON_STATE_CLOSED) {
+ if (con->state == CEPH_CON_S_CLOSED) {
dout("con_send %p closed, dropping %p\n", con, msg);
ceph_msg_put(msg);
mutex_unlock(&con->mutex);
@@ -3114,7 +1712,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
/* if there wasn't anything waiting to send before, queue
* new work */
- if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_send);
@@ -3132,36 +1730,30 @@ void ceph_msg_revoke(struct ceph_msg *msg)
}
mutex_lock(&con->mutex);
- if (!list_empty(&msg->list_head)) {
- dout("%s %p msg %p - was on queue\n", __func__, con, msg);
- list_del_init(&msg->list_head);
- msg->hdr.seq = 0;
-
- ceph_msg_put(msg);
+ if (list_empty(&msg->list_head)) {
+ WARN_ON(con->out_msg == msg);
+ dout("%s con %p msg %p not linked\n", __func__, con, msg);
+ mutex_unlock(&con->mutex);
+ return;
}
+
+ dout("%s con %p msg %p was linked\n", __func__, con, msg);
+ msg->hdr.seq = 0;
+ ceph_msg_remove(msg);
+
if (con->out_msg == msg) {
- BUG_ON(con->out_skip);
- /* footer */
- if (con->out_msg_done) {
- con->out_skip += con_out_kvec_skip(con);
- } else {
- BUG_ON(!msg->data_length);
- con->out_skip += sizeof_footer(con);
- }
- /* data, middle, front */
- if (msg->data_length)
- con->out_skip += msg->cursor.total_resid;
- if (msg->middle)
- con->out_skip += con_out_kvec_skip(con);
- con->out_skip += con_out_kvec_skip(con);
-
- dout("%s %p msg %p - was sending, will write %d skip %d\n",
- __func__, con, msg, con->out_kvec_bytes, con->out_skip);
- msg->hdr.seq = 0;
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+ dout("%s con %p msg %p was sending\n", __func__, con, msg);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke(con);
+ else
+ ceph_con_v1_revoke(con);
+ ceph_msg_put(con->out_msg);
con->out_msg = NULL;
- ceph_msg_put(msg);
+ } else {
+ dout("%s con %p msg %p not current, out_msg %p\n", __func__,
+ con, msg, con->out_msg);
}
-
mutex_unlock(&con->mutex);
}
@@ -3179,25 +1771,17 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg)
mutex_lock(&con->mutex);
if (con->in_msg == msg) {
- unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
- unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
- unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
-
- /* skip rest of message */
- dout("%s %p msg %p revoked\n", __func__, con, msg);
- con->in_base_pos = con->in_base_pos -
- sizeof(struct ceph_msg_header) -
- front_len -
- middle_len -
- data_len -
- sizeof(struct ceph_msg_footer);
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+ dout("%s con %p msg %p was recving\n", __func__, con, msg);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke_incoming(con);
+ else
+ ceph_con_v1_revoke_incoming(con);
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
- con->in_tag = CEPH_MSGR_TAG_READY;
- con->in_seq++;
} else {
- dout("%s %p in_msg %p msg %p no-op\n",
- __func__, con, con->in_msg, msg);
+ dout("%s con %p msg %p not current, in_msg %p\n", __func__,
+ con, msg, con->in_msg);
}
mutex_unlock(&con->mutex);
}
@@ -3210,10 +1794,10 @@ void ceph_con_keepalive(struct ceph_connection *con)
dout("con_keepalive %p\n", con);
mutex_lock(&con->mutex);
clear_standby(con);
- con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING);
+ ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING);
mutex_unlock(&con->mutex);
- if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_keepalive);
@@ -3419,9 +2003,9 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
* On error (ENOMEM, EAGAIN, ...),
* - con->in_msg == NULL
*/
-static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
+int ceph_con_in_msg_alloc(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip)
{
- struct ceph_msg_header *hdr = &con->in_hdr;
int middle_len = le32_to_cpu(hdr->middle_len);
struct ceph_msg *msg;
int ret = 0;
@@ -3432,7 +2016,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
mutex_unlock(&con->mutex);
msg = con->ops->alloc_msg(con, hdr, skip);
mutex_lock(&con->mutex);
- if (con->state != CON_STATE_OPEN) {
+ if (con->state != CEPH_CON_S_OPEN) {
if (msg)
ceph_msg_put(msg);
return -EAGAIN;
@@ -3453,7 +2037,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
con->error_msg = "error allocating memory for incoming message";
return -ENOMEM;
}
- memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+ memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr));
if (middle_len && !con->in_msg->middle) {
ret = ceph_alloc_middle(con, con->in_msg);
@@ -3466,6 +2050,39 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
return ret;
}
+void ceph_con_get_out_msg(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+
+ BUG_ON(list_empty(&con->out_queue));
+ msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
+ WARN_ON(msg->con != con);
+
+ /*
+ * Put the message on "sent" list using a ref from ceph_con_send().
+ * It is put when the message is acked or revoked.
+ */
+ list_move_tail(&msg->list_head, &con->out_sent);
+
+ /*
+ * Only assign outgoing seq # if we haven't sent this message
+	 * yet. If it is requeued, resend with its original seq.
+ */
+ if (msg->needs_out_seq) {
+ msg->hdr.seq = cpu_to_le64(++con->out_seq);
+ msg->needs_out_seq = false;
+
+ if (con->ops->reencode_message)
+ con->ops->reencode_message(msg);
+ }
+
+ /*
+ * Get a ref for out_msg. It is put when we are done sending the
+ * message or in case of a fault.
+ */
+ WARN_ON(con->out_msg);
+ con->out_msg = ceph_msg_get(msg);
+}
/*
* Free a generically kmalloc'd message.
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
new file mode 100644
index 000000000000..2cb5ffdf071a
--- /dev/null
+++ b/net/ceph/messenger_v1.c
@@ -0,0 +1,1506 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
+
+/*
+ * If @buf is NULL, discard up to @len bytes.
+ */
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ if (!buf)
+ msg.msg_flags |= MSG_TRUNC;
+
+ iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
+ int page_offset, size_t length)
+{
+ struct bio_vec bvec = {
+ .bv_page = page,
+ .bv_offset = page_offset,
+ .bv_len = length
+ };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ BUG_ON(page_offset + length > PAGE_SIZE);
+ iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+/*
+ * write something. @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+ size_t kvlen, size_t len, bool more)
+{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+/*
+ * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
+ */
+static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int more)
+{
+ ssize_t (*sendpage)(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags);
+ int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
+ int ret;
+
+ /*
+ * sendpage cannot properly handle pages with page_count == 0,
+ * we need to fall back to sendmsg if that's the case.
+ *
+ * Same goes for slab pages: skb_can_coalesce() allows
+ * coalescing neighboring slab objects into a single frag which
+	 * triggers one of the hardened usercopy checks.
+ */
+ if (sendpage_ok(page))
+ sendpage = sock->ops->sendpage;
+ else
+ sendpage = sock_no_sendpage;
+
+ ret = sendpage(sock, page, offset, size, flags);
+ if (ret == -EAGAIN)
+ ret = 0;
+
+ return ret;
+}
+
+static void con_out_kvec_reset(struct ceph_connection *con)
+{
+ BUG_ON(con->v1.out_skip);
+
+ con->v1.out_kvec_left = 0;
+ con->v1.out_kvec_bytes = 0;
+ con->v1.out_kvec_cur = &con->v1.out_kvec[0];
+}
+
+static void con_out_kvec_add(struct ceph_connection *con,
+ size_t size, void *data)
+{
+ int index = con->v1.out_kvec_left;
+
+ BUG_ON(con->v1.out_skip);
+ BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec));
+
+ con->v1.out_kvec[index].iov_len = size;
+ con->v1.out_kvec[index].iov_base = data;
+ con->v1.out_kvec_left++;
+ con->v1.out_kvec_bytes += size;
+}
+
+/*
+ * Chop off a kvec from the end. Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+ int skip = 0;
+
+ if (con->v1.out_kvec_bytes > 0) {
+ skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len;
+ BUG_ON(con->v1.out_kvec_bytes < skip);
+ BUG_ON(!con->v1.out_kvec_left);
+ con->v1.out_kvec_bytes -= skip;
+ con->v1.out_kvec_left--;
+ }
+
+ return skip;
+}
+
+static size_t sizeof_footer(struct ceph_connection *con)
+{
+ return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
+ sizeof(struct ceph_msg_footer) :
+ sizeof(struct ceph_msg_footer_old);
+}
+
+static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
+{
+ /* Initialize data cursor */
+
+ ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
+}
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off.  Assumes out_kvec* are already valid; we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->out_msg;
+
+ m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
+ dout("prepare_write_message_footer %p\n", con);
+ con_out_kvec_add(con, sizeof_footer(con), &m->footer);
+ if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
+ if (con->ops->sign_message)
+ con->ops->sign_message(m);
+ else
+ m->footer.sig = 0;
+ } else {
+ m->old_footer.flags = m->footer.flags;
+ }
+ con->v1.out_more = m->more_to_follow;
+ con->v1.out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ u32 crc;
+
+ con_out_kvec_reset(con);
+ con->v1.out_msg_done = false;
+
+ /* Sneak an ack in there first? If we can get it into the same
+ * TCP packet that's a good thing. */
+ if (con->in_seq > con->in_seq_acked) {
+ con->in_seq_acked = con->in_seq;
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+ }
+
+ ceph_con_get_out_msg(con);
+ m = con->out_msg;
+
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
+ m, con->out_seq, le16_to_cpu(m->hdr.type),
+ le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+ m->data_length);
+ WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
+ WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
+
+ /* tag + hdr + front + middle */
+ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+ con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr);
+ con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+
+ if (m->middle)
+ con_out_kvec_add(con, m->middle->vec.iov_len,
+ m->middle->vec.iov_base);
+
+ /* fill in hdr crc and finalize hdr */
+ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
+ con->out_msg->hdr.crc = cpu_to_le32(crc);
+ memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr));
+
+ /* fill in front and middle crc, footer */
+ crc = crc32c(0, m->front.iov_base, m->front.iov_len);
+ con->out_msg->footer.front_crc = cpu_to_le32(crc);
+ if (m->middle) {
+ crc = crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len);
+ con->out_msg->footer.middle_crc = cpu_to_le32(crc);
+ } else
+ con->out_msg->footer.middle_crc = 0;
+ dout("%s front_crc %u middle_crc %u\n", __func__,
+ le32_to_cpu(con->out_msg->footer.front_crc),
+ le32_to_cpu(con->out_msg->footer.middle_crc));
+ con->out_msg->footer.flags = 0;
+
+ /* is there a data payload? */
+ con->out_msg->footer.data_crc = 0;
+ if (m->data_length) {
+ prepare_message_data(con->out_msg, m->data_length);
+ con->v1.out_more = 1; /* data + footer will follow */
+ } else {
+ /* no, queue up footer too and be done */
+ prepare_write_message_footer(con);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+ dout("prepare_write_ack %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+
+ con->v1.out_more = 1; /* more will follow.. eventually.. */
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to share the seq during handshake
+ */
+static void prepare_write_seq(struct ceph_connection *con)
+{
+ dout("prepare_write_seq %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+ dout("prepare_write_keepalive %p\n", con);
+ con_out_kvec_reset(con);
+ if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+ struct timespec64 now;
+
+ ktime_get_real_ts64(&now);
+ con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+ ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2),
+ &con->v1.out_temp_keepalive2);
+ } else {
+ con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+ }
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static int get_connect_authorizer(struct ceph_connection *con)
+{
+ struct ceph_auth_handshake *auth;
+ int auth_proto;
+
+ if (!con->ops->get_authorizer) {
+ con->v1.auth = NULL;
+ con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
+ con->v1.out_connect.authorizer_len = 0;
+ return 0;
+ }
+
+ auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry);
+ if (IS_ERR(auth))
+ return PTR_ERR(auth);
+
+ con->v1.auth = auth;
+ con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
+ con->v1.out_connect.authorizer_len =
+ cpu_to_le32(auth->authorizer_buf_len);
+ return 0;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
+ con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
+ &con->msgr->my_enc_addr);
+
+ con->v1.out_more = 0;
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static void __prepare_write_connect(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, sizeof(con->v1.out_connect),
+ &con->v1.out_connect);
+ if (con->v1.auth)
+ con_out_kvec_add(con, con->v1.auth->authorizer_buf_len,
+ con->v1.auth->authorizer_buf);
+
+ con->v1.out_more = 0;
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static int prepare_write_connect(struct ceph_connection *con)
+{
+ unsigned int global_seq = ceph_get_global_seq(con->msgr, 0);
+ int proto;
+ int ret;
+
+ switch (con->peer_name.type) {
+ case CEPH_ENTITY_TYPE_MON:
+ proto = CEPH_MONC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_OSD:
+ proto = CEPH_OSDC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_MDS:
+ proto = CEPH_MDSC_PROTOCOL;
+ break;
+ default:
+ BUG();
+ }
+
+ dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+ con->v1.connect_seq, global_seq, proto);
+
+ con->v1.out_connect.features =
+ cpu_to_le64(from_msgr(con->msgr)->supported_features);
+ con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+ con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq);
+ con->v1.out_connect.global_seq = cpu_to_le32(global_seq);
+ con->v1.out_connect.protocol_version = cpu_to_le32(proto);
+ con->v1.out_connect.flags = 0;
+
+ ret = get_connect_authorizer(con);
+ if (ret)
+ return ret;
+
+ __prepare_write_connect(con);
+ return 0;
+}
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ * 1 -> done
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes);
+ while (con->v1.out_kvec_bytes > 0) {
+ ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur,
+ con->v1.out_kvec_left,
+ con->v1.out_kvec_bytes,
+ con->v1.out_more);
+ if (ret <= 0)
+ goto out;
+ con->v1.out_kvec_bytes -= ret;
+ if (!con->v1.out_kvec_bytes)
+ break; /* done */
+
+ /* account for full iov entries consumed */
+ while (ret >= con->v1.out_kvec_cur->iov_len) {
+ BUG_ON(!con->v1.out_kvec_left);
+ ret -= con->v1.out_kvec_cur->iov_len;
+ con->v1.out_kvec_cur++;
+ con->v1.out_kvec_left--;
+ }
+ /* and for a partially-consumed entry */
+ if (ret) {
+ con->v1.out_kvec_cur->iov_len -= ret;
+ con->v1.out_kvec_cur->iov_base += ret;
+ }
+ }
+ con->v1.out_kvec_left = 0;
+ ret = 1;
+out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+ con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret);
+ return ret; /* done! */
+}
+
+/*
+ * Write as much message data payload as we can. If we finish, queue
+ * up the footer.
+ * 1 -> done, footer is now queued in out_kvec[].
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_message_data(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ u32 crc;
+
+ dout("%s %p msg %p\n", __func__, con, msg);
+
+ if (!msg->num_data_items)
+ return -EINVAL;
+
+ /*
+ * Iterate through each page that contains data to be
+ * written, and send as much as possible for each.
+ *
+ * If we are calculating the data crc (the default), we will
+ * need to map the page. If we have no pages, they have
+ * been revoked, so use the zero page.
+ */
+ crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
+ while (cursor->total_resid) {
+ struct page *page;
+ size_t page_offset;
+ size_t length;
+ int ret;
+
+ if (!cursor->resid) {
+ ceph_msg_data_advance(cursor, 0);
+ continue;
+ }
+
+ page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+ if (length == cursor->total_resid)
+ more = MSG_MORE;
+ ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
+ more);
+ if (ret <= 0) {
+ if (do_datacrc)
+ msg->footer.data_crc = cpu_to_le32(crc);
+
+ return ret;
+ }
+ if (do_datacrc && cursor->need_crc)
+ crc = ceph_crc32c_page(crc, page, page_offset, length);
+ ceph_msg_data_advance(cursor, (size_t)ret);
+ }
+
+ dout("%s %p msg %p done\n", __func__, con, msg);
+
+ /* prepare and queue up footer, too */
+ if (do_datacrc)
+ msg->footer.data_crc = cpu_to_le32(crc);
+ else
+ msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+ con_out_kvec_reset(con);
+ prepare_write_message_footer(con);
+
+ return 1; /* must return > 0 to indicate success */
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+ int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ int ret;
+
+ dout("%s %p %d left\n", __func__, con, con->v1.out_skip);
+ while (con->v1.out_skip > 0) {
+ size_t size = min(con->v1.out_skip, (int)PAGE_SIZE);
+
+ if (size == con->v1.out_skip)
+ more = MSG_MORE;
+ ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
+ more);
+ if (ret <= 0)
+ goto out;
+ con->v1.out_skip -= ret;
+ }
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+ dout("prepare_read_banner %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+ dout("prepare_read_connect %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_ack %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+static void prepare_read_seq(struct ceph_connection *con)
+{
+ dout("prepare_read_seq %p\n", con);
+ con->v1.in_base_pos = 0;
+ con->v1.in_tag = CEPH_MSGR_TAG_SEQ;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+ dout("prepare_read_tag %p\n", con);
+ con->v1.in_base_pos = 0;
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+}
+
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_keepalive_ack %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+ dout("prepare_read_message %p\n", con);
+ BUG_ON(con->in_msg != NULL);
+ con->v1.in_base_pos = 0;
+ con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+ return 0;
+}
+
+static int read_partial(struct ceph_connection *con,
+ int end, int size, void *object)
+{
+ while (con->v1.in_base_pos < end) {
+ int left = end - con->v1.in_base_pos;
+ int have = size - left;
+ int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+ if (ret <= 0)
+ return ret;
+ con->v1.in_base_pos += ret;
+ }
+ return 1;
+}
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+ int size;
+ int end;
+ int ret;
+
+ dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos);
+
+ /* peer's banner */
+ size = strlen(CEPH_BANNER);
+ end = size;
+ ret = read_partial(con, end, size, con->v1.in_banner);
+ if (ret <= 0)
+ goto out;
+
+ size = sizeof(con->v1.actual_peer_addr);
+ end += size;
+ ret = read_partial(con, end, size, &con->v1.actual_peer_addr);
+ if (ret <= 0)
+ goto out;
+ ceph_decode_banner_addr(&con->v1.actual_peer_addr);
+
+ size = sizeof(con->v1.peer_addr_for_me);
+ end += size;
+ ret = read_partial(con, end, size, &con->v1.peer_addr_for_me);
+ if (ret <= 0)
+ goto out;
+ ceph_decode_banner_addr(&con->v1.peer_addr_for_me);
+
+out:
+ return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+ int size;
+ int end;
+ int ret;
+
+ dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos);
+
+ size = sizeof(con->v1.in_reply);
+ end = size;
+ ret = read_partial(con, end, size, &con->v1.in_reply);
+ if (ret <= 0)
+ goto out;
+
+ if (con->v1.auth) {
+ size = le32_to_cpu(con->v1.in_reply.authorizer_len);
+ if (size > con->v1.auth->authorizer_reply_buf_len) {
+ pr_err("authorizer reply too big: %d > %zu\n", size,
+ con->v1.auth->authorizer_reply_buf_len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ end += size;
+ ret = read_partial(con, end, size,
+ con->v1.auth->authorizer_reply_buf);
+ if (ret <= 0)
+ goto out;
+ }
+
+ dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+ con, con->v1.in_reply.tag,
+ le32_to_cpu(con->v1.in_reply.connect_seq),
+ le32_to_cpu(con->v1.in_reply.global_seq));
+out:
+ return ret;
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+ if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+ pr_err("connect to %s got bad banner\n",
+ ceph_pr_addr(&con->peer_addr));
+ con->error_msg = "protocol error, bad banner";
+ return -1;
+ }
+ return 0;
+}
+
+static int process_banner(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+
+ dout("process_banner on %p\n", con);
+
+ if (verify_hello(con) < 0)
+ return -1;
+
+ /*
+	 * Make sure the other end is who we wanted.  Note that the other
+	 * end may not yet know their IP address, so if it's 0.0.0.0, give
+ * them the benefit of the doubt.
+ */
+ if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr,
+ sizeof(con->peer_addr)) != 0 &&
+ !(ceph_addr_is_blank(&con->v1.actual_peer_addr) &&
+ con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_warn("wrong peer, want %s/%u, got %s/%u\n",
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&con->v1.actual_peer_addr),
+ le32_to_cpu(con->v1.actual_peer_addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -1;
+ }
+
+ /*
+ * did we learn our address?
+ */
+ if (ceph_addr_is_blank(my_addr)) {
+ memcpy(&my_addr->in_addr,
+ &con->v1.peer_addr_for_me.in_addr,
+ sizeof(con->v1.peer_addr_for_me.in_addr));
+ ceph_addr_set_port(my_addr, 0);
+ ceph_encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ ceph_pr_addr(my_addr));
+ }
+
+ return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ u64 sup_feat = from_msgr(con->msgr)->supported_features;
+ u64 req_feat = from_msgr(con->msgr)->required_features;
+ u64 server_feat = le64_to_cpu(con->v1.in_reply.features);
+ int ret;
+
+ dout("process_connect on %p tag %d\n", con, con->v1.in_tag);
+
+ if (con->v1.auth) {
+ int len = le32_to_cpu(con->v1.in_reply.authorizer_len);
+
+ /*
+ * Any connection that defines ->get_authorizer()
+ * should also define ->add_authorizer_challenge() and
+ * ->verify_authorizer_reply().
+ *
+ * See get_connect_authorizer().
+ */
+ if (con->v1.in_reply.tag ==
+ CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+ ret = con->ops->add_authorizer_challenge(
+ con, con->v1.auth->authorizer_reply_buf, len);
+ if (ret < 0)
+ return ret;
+
+ con_out_kvec_reset(con);
+ __prepare_write_connect(con);
+ prepare_read_connect(con);
+ return 0;
+ }
+
+ if (len) {
+ ret = con->ops->verify_authorizer_reply(con);
+ if (ret < 0) {
+ con->error_msg = "bad authorize reply";
+ return ret;
+ }
+ }
+ }
+
+ switch (con->v1.in_reply.tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ pr_err("%s%lld %s feature set mismatch,"
+ " my %llx < server's %llx, missing %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->v1.out_connect.protocol_version),
+ le32_to_cpu(con->v1.in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->v1.auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->v1.auth_retry);
+ if (con->v1.auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ return -1;
+ }
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+	 * connect_seq == 0), they will send RESETSESSION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */
+ dout("process_connect got RESET peer seq %u\n",
+ le32_to_cpu(con->v1.in_reply.connect_seq));
+ pr_info("%s%lld %s session reset\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr));
+ ceph_con_reset_session(con);
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+
+ /* Tell ceph about it. */
+ mutex_unlock(&con->mutex);
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V1_CONNECT_MSG)
+ return -EAGAIN;
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_SESSION:
+ /*
+ * If we sent a smaller connect_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
+ le32_to_cpu(con->v1.out_connect.connect_seq),
+ le32_to_cpu(con->v1.in_reply.connect_seq));
+ con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq);
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_GLOBAL:
+ /*
+ * If we sent a smaller global_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+ con->v1.peer_global_seq,
+ le32_to_cpu(con->v1.in_reply.global_seq));
+ ceph_get_global_seq(con->msgr,
+ le32_to_cpu(con->v1.in_reply.global_seq));
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_SEQ:
+ case CEPH_MSGR_TAG_READY:
+ if (req_feat & ~server_feat) {
+ pr_err("%s%lld %s protocol feature mismatch,"
+ " my required %llx > server's %llx, need %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ req_feat, server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ return -1;
+ }
+
+ WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
+ con->state = CEPH_CON_S_OPEN;
+ con->v1.auth_retry = 0; /* we authenticated; clear flag */
+ con->v1.peer_global_seq =
+ le32_to_cpu(con->v1.in_reply.global_seq);
+ con->v1.connect_seq++;
+ con->peer_features = server_feat;
+ dout("process_connect got READY gseq %d cseq %d (%d)\n",
+ con->v1.peer_global_seq,
+ le32_to_cpu(con->v1.in_reply.connect_seq),
+ con->v1.connect_seq);
+ WARN_ON(con->v1.connect_seq !=
+ le32_to_cpu(con->v1.in_reply.connect_seq));
+
+ if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+ ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+
+ con->delay = 0; /* reset backoff memory */
+
+ if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) {
+ prepare_write_seq(con);
+ prepare_read_seq(con);
+ } else {
+ prepare_read_tag(con);
+ }
+ break;
+
+ case CEPH_MSGR_TAG_WAIT:
+ /*
+ * If there is a connection race (we are opening
+ * connections to each other), one of us may just have
+ * to WAIT. This shouldn't happen if we are the
+ * client.
+ */
+ con->error_msg = "protocol error, got WAIT as client";
+ return -1;
+
+ default:
+ con->error_msg = "protocol error, garbage tag during connect";
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+ int size = sizeof(con->v1.in_temp_ack);
+ int end = size;
+
+ return read_partial(con, end, size, &con->v1.in_temp_ack);
+}
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+ u64 ack = le64_to_cpu(con->v1.in_temp_ack);
+
+ if (con->v1.in_tag == CEPH_MSGR_TAG_ACK)
+ ceph_con_discard_sent(con, ack);
+ else
+ ceph_con_discard_requeued(con, ack);
+
+ prepare_read_tag(con);
+}
+
+static int read_partial_message_section(struct ceph_connection *con,
+ struct kvec *section,
+ unsigned int sec_len, u32 *crc)
+{
+ int ret, left;
+
+ BUG_ON(!section);
+
+ while (section->iov_len < sec_len) {
+ BUG_ON(section->iov_base == NULL);
+ left = sec_len - section->iov_len;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+ section->iov_len, left);
+ if (ret <= 0)
+ return ret;
+ section->iov_len += ret;
+ }
+ if (section->iov_len == sec_len)
+ *crc = crc32c(0, section->iov_base, section->iov_len);
+
+ return 1;
+}
+
+static int read_partial_msg_data(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->in_msg;
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ struct page *page;
+ size_t page_offset;
+ size_t length;
+ u32 crc = 0;
+ int ret;
+
+ if (!msg->num_data_items)
+ return -EIO;
+
+ if (do_datacrc)
+ crc = con->in_data_crc;
+ while (cursor->total_resid) {
+ if (!cursor->resid) {
+ ceph_msg_data_advance(cursor, 0);
+ continue;
+ }
+
+ page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+ ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
+ if (ret <= 0) {
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return ret;
+ }
+
+ if (do_datacrc)
+ crc = ceph_crc32c_page(crc, page, page_offset, ret);
+ ceph_msg_data_advance(cursor, (size_t)ret);
+ }
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return 1; /* must return > 0 to indicate success */
+}
+
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->in_msg;
+ int size;
+ int end;
+ int ret;
+ unsigned int front_len, middle_len, data_len;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
+ u64 seq;
+ u32 crc;
+
+ dout("read_partial_message con %p msg %p\n", con, m);
+
+ /* header */
+ size = sizeof(con->v1.in_hdr);
+ end = size;
+ ret = read_partial(con, end, size, &con->v1.in_hdr);
+ if (ret <= 0)
+ return ret;
+
+ crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc));
+ if (cpu_to_le32(crc) != con->v1.in_hdr.crc) {
+ pr_err("read_partial_message bad hdr crc %u != expected %u\n",
+ crc, con->v1.in_hdr.crc);
+ return -EBADMSG;
+ }
+
+ front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+ if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+ return -EIO;
+ middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+ if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
+ return -EIO;
+ data_len = le32_to_cpu(con->v1.in_hdr.data_len);
+ if (data_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+
+ /* verify seq# */
+ seq = le64_to_cpu(con->v1.in_hdr.seq);
+ if ((s64)seq - (s64)con->in_seq < 1) {
+ pr_info("skipping %s%lld %s seq %lld expected %lld\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ seq, con->in_seq + 1);
+ con->v1.in_base_pos = -front_len - middle_len - data_len -
+ sizeof_footer(con);
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ return 1;
+ } else if ((s64)seq - (s64)con->in_seq > 1) {
+ pr_err("read_partial_message bad seq %lld expected %lld\n",
+ seq, con->in_seq + 1);
+ con->error_msg = "bad message sequence # for incoming message";
+ return -EBADE;
+ }
+
+ /* allocate message? */
+ if (!con->in_msg) {
+ int skip = 0;
+
+ dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type,
+ front_len, data_len);
+ ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip);
+ if (ret < 0)
+ return ret;
+
+ BUG_ON((!con->in_msg) ^ skip);
+ if (skip) {
+ /* skip this message */
+ dout("alloc_msg said skip message\n");
+ con->v1.in_base_pos = -front_len - middle_len -
+ data_len - sizeof_footer(con);
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+ return 1;
+ }
+
+ BUG_ON(!con->in_msg);
+ BUG_ON(con->in_msg->con != con);
+ m = con->in_msg;
+ m->front.iov_len = 0; /* haven't read it yet */
+ if (m->middle)
+ m->middle->vec.iov_len = 0;
+
+ /* prepare for data payload, if any */
+
+ if (data_len)
+ prepare_message_data(con->in_msg, data_len);
+ }
+
+ /* front */
+ ret = read_partial_message_section(con, &m->front, front_len,
+ &con->in_front_crc);
+ if (ret <= 0)
+ return ret;
+
+ /* middle */
+ if (m->middle) {
+ ret = read_partial_message_section(con, &m->middle->vec,
+ middle_len,
+ &con->in_middle_crc);
+ if (ret <= 0)
+ return ret;
+ }
+
+ /* (page) data */
+ if (data_len) {
+ ret = read_partial_msg_data(con);
+ if (ret <= 0)
+ return ret;
+ }
+
+ /* footer */
+ size = sizeof_footer(con);
+ end += size;
+ ret = read_partial(con, end, size, &m->footer);
+ if (ret <= 0)
+ return ret;
+
+ if (!need_sign) {
+ m->footer.flags = m->old_footer.flags;
+ m->footer.sig = 0;
+ }
+
+ dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+ m, front_len, m->footer.front_crc, middle_len,
+ m->footer.middle_crc, data_len, m->footer.data_crc);
+
+ /* crc ok? */
+ if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+ pr_err("read_partial_message %p front crc %u != exp. %u\n",
+ m, con->in_front_crc, m->footer.front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+ pr_err("read_partial_message %p middle crc %u != exp %u\n",
+ m, con->in_middle_crc, m->footer.middle_crc);
+ return -EBADMSG;
+ }
+ if (do_datacrc &&
+ (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+ con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+ pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+ con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+ return -EBADMSG;
+ }
+
+ if (need_sign && con->ops->check_message_signature &&
+ con->ops->check_message_signature(m)) {
+ pr_err("read_partial_message %p signature check failed\n", m);
+ return -EBADMSG;
+ }
+
+ return 1; /* done! */
+}
+
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+ struct ceph_timespec ceph_ts;
+ size_t size = sizeof(ceph_ts);
+ int ret = read_partial(con, size, size, &ceph_ts);
+ if (ret <= 0)
+ return ret;
+ ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
+ prepare_read_tag(con);
+ return 1;
+}
+
+/*
+ * Read what we can from the socket.
+ */
+int ceph_con_v1_try_read(struct ceph_connection *con)
+{
+ int ret = -1;
+
+more:
+ dout("try_read start %p state %d\n", con, con->state);
+ if (con->state != CEPH_CON_S_V1_BANNER &&
+ con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+ con->state != CEPH_CON_S_OPEN)
+ return 0;
+
+ BUG_ON(!con->sock);
+
+ dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag,
+ con->v1.in_base_pos);
+
+ if (con->state == CEPH_CON_S_V1_BANNER) {
+ ret = read_partial_banner(con);
+ if (ret <= 0)
+ goto out;
+ ret = process_banner(con);
+ if (ret < 0)
+ goto out;
+
+ con->state = CEPH_CON_S_V1_CONNECT_MSG;
+
+ /*
+ * Received banner is good, exchange connection info.
+ * Do not reset out_kvec, as sending our banner raced
+ * with receiving peer banner after connect completed.
+ */
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ goto out;
+ prepare_read_connect(con);
+
+ /* Send connection info before awaiting response */
+ goto out;
+ }
+
+ if (con->state == CEPH_CON_S_V1_CONNECT_MSG) {
+ ret = read_partial_connect(con);
+ if (ret <= 0)
+ goto out;
+ ret = process_connect(con);
+ if (ret < 0)
+ goto out;
+ goto more;
+ }
+
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+
+ if (con->v1.in_base_pos < 0) {
+ /*
+ * skipping + discarding content.
+ */
+ ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos);
+ if (ret <= 0)
+ goto out;
+ dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos);
+ con->v1.in_base_pos += ret;
+ if (con->v1.in_base_pos)
+ goto more;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_READY) {
+ /*
+ * what's next?
+ */
+ ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1);
+ if (ret <= 0)
+ goto out;
+ dout("try_read got tag %d\n", con->v1.in_tag);
+ switch (con->v1.in_tag) {
+ case CEPH_MSGR_TAG_MSG:
+ prepare_read_message(con);
+ break;
+ case CEPH_MSGR_TAG_ACK:
+ prepare_read_ack(con);
+ break;
+ case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+ prepare_read_keepalive_ack(con);
+ break;
+ case CEPH_MSGR_TAG_CLOSE:
+ ceph_con_close_socket(con);
+ con->state = CEPH_CON_S_CLOSED;
+ goto out;
+ default:
+ goto bad_tag;
+ }
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) {
+ ret = read_partial_message(con);
+ if (ret <= 0) {
+ switch (ret) {
+ case -EBADMSG:
+ con->error_msg = "bad crc/signature";
+ fallthrough;
+ case -EBADE:
+ ret = -EIO;
+ break;
+ case -EIO:
+ con->error_msg = "io error";
+ break;
+ }
+ goto out;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_READY)
+ goto more;
+ ceph_con_process_message(con);
+ if (con->state == CEPH_CON_S_OPEN)
+ prepare_read_tag(con);
+ goto more;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_ACK ||
+ con->v1.in_tag == CEPH_MSGR_TAG_SEQ) {
+ /*
+ * the final handshake seq exchange is semantically
+ * equivalent to an ACK
+ */
+ ret = read_partial_ack(con);
+ if (ret <= 0)
+ goto out;
+ process_ack(con);
+ goto more;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+ ret = read_keepalive_ack(con);
+ if (ret <= 0)
+ goto out;
+ goto more;
+ }
+
+out:
+ dout("try_read done on %p ret %d\n", con, ret);
+ return ret;
+
+bad_tag:
+ pr_err("try_read bad tag %d\n", con->v1.in_tag);
+ con->error_msg = "protocol error, garbage tag";
+ ret = -1;
+ goto out;
+}
+
+/*
+ * Write something to the socket. Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+int ceph_con_v1_try_write(struct ceph_connection *con)
+{
+ int ret = 1;
+
+ dout("try_write start %p state %d\n", con, con->state);
+ if (con->state != CEPH_CON_S_PREOPEN &&
+ con->state != CEPH_CON_S_V1_BANNER &&
+ con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+ con->state != CEPH_CON_S_OPEN)
+ return 0;
+
+ /* open the socket first? */
+ if (con->state == CEPH_CON_S_PREOPEN) {
+ BUG_ON(con->sock);
+ con->state = CEPH_CON_S_V1_BANNER;
+
+ con_out_kvec_reset(con);
+ prepare_write_banner(con);
+ prepare_read_banner(con);
+
+ BUG_ON(con->in_msg);
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ dout("try_write initiating connect on %p new state %d\n",
+ con, con->state);
+ ret = ceph_tcp_connect(con);
+ if (ret < 0) {
+ con->error_msg = "connect error";
+ goto out;
+ }
+ }
+
+more:
+ dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes);
+ BUG_ON(!con->sock);
+
+ /* kvec data queued? */
+ if (con->v1.out_kvec_left) {
+ ret = write_partial_kvec(con);
+ if (ret <= 0)
+ goto out;
+ }
+ if (con->v1.out_skip) {
+ ret = write_partial_skip(con);
+ if (ret <= 0)
+ goto out;
+ }
+
+ /* msg pages? */
+ if (con->out_msg) {
+ if (con->v1.out_msg_done) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL; /* we're done with this one */
+ goto do_next;
+ }
+
+ ret = write_partial_message_data(con);
+ if (ret == 1)
+ goto more; /* we need to send the footer, too! */
+ if (ret == 0)
+ goto out;
+ if (ret < 0) {
+ dout("try_write write_partial_message_data err %d\n",
+ ret);
+ goto out;
+ }
+ }
+
+do_next:
+ if (con->state == CEPH_CON_S_OPEN) {
+ if (ceph_con_flag_test_and_clear(con,
+ CEPH_CON_F_KEEPALIVE_PENDING)) {
+ prepare_write_keepalive(con);
+ goto more;
+ }
+ /* is anything else pending? */
+ if (!list_empty(&con->out_queue)) {
+ prepare_write_message(con);
+ goto more;
+ }
+ if (con->in_seq > con->in_seq_acked) {
+ prepare_write_ack(con);
+ goto more;
+ }
+ }
+
+ /* Nothing to do! */
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ dout("try_write nothing else to write.\n");
+ ret = 0;
+out:
+ dout("try_write done on %p ret %d\n", con, ret);
+ return ret;
+}
+
+void ceph_con_v1_revoke(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+
+ WARN_ON(con->v1.out_skip);
+ /* footer */
+ if (con->v1.out_msg_done) {
+ con->v1.out_skip += con_out_kvec_skip(con);
+ } else {
+ WARN_ON(!msg->data_length);
+ con->v1.out_skip += sizeof_footer(con);
+ }
+ /* data, middle, front */
+ if (msg->data_length)
+ con->v1.out_skip += msg->cursor.total_resid;
+ if (msg->middle)
+ con->v1.out_skip += con_out_kvec_skip(con);
+ con->v1.out_skip += con_out_kvec_skip(con);
+
+ dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
+ con->v1.out_kvec_bytes, con->v1.out_skip);
+}
+
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
+{
+ unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+ unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+ unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len);
+
+ /* skip rest of message */
+ con->v1.in_base_pos = con->v1.in_base_pos -
+ sizeof(struct ceph_msg_header) -
+ front_len -
+ middle_len -
+ data_len -
+ sizeof(struct ceph_msg_footer);
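+	/*
+	 * A negative in_base_pos tells ceph_con_v1_try_read() how many
+	 * bytes still have to be received and thrown away before the
+	 * next tag (see its "skipping + discarding" branch).
+	 */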
+
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+
+ dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos);
+}
+
+bool ceph_con_v1_opened(struct ceph_connection *con)
+{
+ return con->v1.connect_seq;
+}
+
+void ceph_con_v1_reset_session(struct ceph_connection *con)
+{
+ con->v1.connect_seq = 0;
+ con->v1.peer_global_seq = 0;
+}
+
+void ceph_con_v1_reset_protocol(struct ceph_connection *con)
+{
+ con->v1.out_skip = 0;
+}
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
new file mode 100644
index 000000000000..cc40ce4e02fb
--- /dev/null
+++ b/net/ceph/messenger_v2.c
@@ -0,0 +1,3459 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ceph msgr2 protocol implementation
+ *
+ * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com>
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <crypto/aead.h>
+#include <crypto/algapi.h> /* for crypto_memneq() */
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/socket.h>
+#include <linux/sched/mm.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */
+
+#define FRAME_TAG_HELLO 1
+#define FRAME_TAG_AUTH_REQUEST 2
+#define FRAME_TAG_AUTH_BAD_METHOD 3
+#define FRAME_TAG_AUTH_REPLY_MORE 4
+#define FRAME_TAG_AUTH_REQUEST_MORE 5
+#define FRAME_TAG_AUTH_DONE 6
+#define FRAME_TAG_AUTH_SIGNATURE 7
+#define FRAME_TAG_CLIENT_IDENT 8
+#define FRAME_TAG_SERVER_IDENT 9
+#define FRAME_TAG_IDENT_MISSING_FEATURES 10
+#define FRAME_TAG_SESSION_RECONNECT 11
+#define FRAME_TAG_SESSION_RESET 12
+#define FRAME_TAG_SESSION_RETRY 13
+#define FRAME_TAG_SESSION_RETRY_GLOBAL 14
+#define FRAME_TAG_SESSION_RECONNECT_OK 15
+#define FRAME_TAG_WAIT 16
+#define FRAME_TAG_MESSAGE 17
+#define FRAME_TAG_KEEPALIVE2 18
+#define FRAME_TAG_KEEPALIVE2_ACK 19
+#define FRAME_TAG_ACK 20
+
+#define FRAME_LATE_STATUS_ABORTED 0x1
+#define FRAME_LATE_STATUS_COMPLETE 0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK 0xf
+
+#define IN_S_HANDLE_PREAMBLE 1
+#define IN_S_HANDLE_CONTROL 2
+#define IN_S_HANDLE_CONTROL_REMAINDER 3
+#define IN_S_PREPARE_READ_DATA 4
+#define IN_S_PREPARE_READ_DATA_CONT 5
+#define IN_S_HANDLE_EPILOGUE 6
+#define IN_S_FINISH_SKIP 7
+
+#define OUT_S_QUEUE_DATA 1
+#define OUT_S_QUEUE_DATA_CONT 2
+#define OUT_S_QUEUE_ENC_PAGE 3
+#define OUT_S_QUEUE_ZEROS 4
+#define OUT_S_FINISH_MESSAGE 5
+#define OUT_S_GET_NEXT 6
+
+#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN)
+#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN)
+#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN)
+#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN)
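+
+/*
+ * Scratch-buffer layout for secure receives: the epilogue lands at the
+ * start of in_buf, and the three GCM-block-sized slots that follow
+ * (FRONT_PAD, MIDDLE_PAD, DATA_PAD) soak up the decrypted zero padding
+ * of the front, middle and data segments (see decrypt_message()).
+ */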
+
+#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
+
+static int do_recvmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+/*
+ * Read as much as possible.
+ *
+ * Return:
+ * 1 - done, nothing (else) to read
+ * 0 - socket is empty, need to wait
+ * <0 - error
+ */
+static int ceph_tcp_recv(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p %s %zu\n", __func__, con,
+ iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need",
+ iov_iter_count(&con->v2.in_iter));
+ ret = do_recvmsg(con->sock, &con->v2.in_iter);
+ dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+ iov_iter_count(&con->v2.in_iter));
+ return ret;
+}
+
+static int do_sendmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_sendmsg(sock, &msg);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ struct bio_vec bv;
+ int ret;
+
+ if (WARN_ON(!iov_iter_is_bvec(it)))
+ return -EINVAL;
+
+ while (iov_iter_count(it)) {
+ /* iov_iter_iovec() for ITER_BVEC */
+ bv.bv_page = it->bvec->bv_page;
+ bv.bv_offset = it->bvec->bv_offset + it->iov_offset;
+ bv.bv_len = min(iov_iter_count(it),
+ it->bvec->bv_len - it->iov_offset);
+
+		/*
+		 * sendpage cannot properly handle pages with
+		 * page_count == 0; we need to fall back to sendmsg in
+		 * that case.
+		 *
+		 * The same goes for slab pages: skb_can_coalesce()
+		 * allows coalescing neighboring slab objects into a
+		 * single frag, which triggers one of the hardened
+		 * usercopy checks.
+		 */
+ if (sendpage_ok(bv.bv_page)) {
+ ret = sock->ops->sendpage(sock, bv.bv_page,
+ bv.bv_offset, bv.bv_len,
+ CEPH_MSG_FLAGS);
+ } else {
+ iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len);
+ ret = sock_sendmsg(sock, &msg);
+ }
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ return 1;
+}
+
+/*
+ * Write as much as possible. The socket is expected to be corked,
+ * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
+ *
+ * Return:
+ * 1 - done, nothing (else) to write
+ * 0 - socket is full, need to wait
+ * <0 - error
+ */
+static int ceph_tcp_send(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
+ iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
+ if (con->v2.out_iter_sendpage)
+ ret = do_try_sendpage(con->sock, &con->v2.out_iter);
+ else
+ ret = do_sendmsg(con->sock, &con->v2.out_iter);
+ dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+ iov_iter_count(&con->v2.out_iter));
+ return ret;
+}
+
+static void add_in_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+
+ con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf;
+ con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len;
+ con->v2.in_kvec_cnt++;
+
+ con->v2.in_iter.nr_segs++;
+ con->v2.in_iter.count += len;
+}
+
+static void reset_in_kvecs(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ con->v2.in_kvec_cnt = 0;
+ iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0);
+}
+
+static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ con->v2.in_bvec = *bv;
+ iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len);
+}
+
+static void set_in_skip(struct ceph_connection *con, int len)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ dout("%s con %p len %d\n", __func__, con, len);
+ iov_iter_discard(&con->v2.in_iter, READ, len);
+}
+
+static void add_out_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf;
+ con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len;
+ con->v2.out_kvec_cnt++;
+
+ con->v2.out_iter.nr_segs++;
+ con->v2.out_iter.count += len;
+}
+
+static void reset_out_kvecs(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_kvec_cnt = 0;
+
+ iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0);
+ con->v2.out_iter_sendpage = false;
+}
+
+static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv,
+ bool zerocopy)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_bvec = *bv;
+ con->v2.out_iter_sendpage = zerocopy;
+ iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1,
+ con->v2.out_bvec.bv_len);
+}
+
+static void set_out_bvec_zero(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(!con->v2.out_zero);
+
+ con->v2.out_bvec.bv_page = ceph_zero_page;
+ con->v2.out_bvec.bv_offset = 0;
+ con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE);
+ con->v2.out_iter_sendpage = true;
+ iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1,
+ con->v2.out_bvec.bv_len);
+}
+
+static void out_zero_add(struct ceph_connection *con, int len)
+{
+ dout("%s con %p len %d\n", __func__, con, len);
+ con->v2.out_zero += len;
+}
+
+static void *alloc_conn_buf(struct ceph_connection *con, int len)
+{
+ void *buf;
+
+ dout("%s con %p len %d\n", __func__, con, len);
+
+ if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs)))
+ return NULL;
+
+ buf = ceph_kvmalloc(len, GFP_NOIO);
+ if (!buf)
+ return NULL;
+
+ con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf;
+ return buf;
+}
+
+static void free_conn_bufs(struct ceph_connection *con)
+{
+ while (con->v2.conn_buf_cnt)
+ kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]);
+}
+
+static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs));
+
+ con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf;
+ con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len;
+ con->v2.in_sign_kvec_cnt++;
+}
+
+static void clear_in_sign_kvecs(struct ceph_connection *con)
+{
+ con->v2.in_sign_kvec_cnt = 0;
+}
+
+static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs));
+
+ con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf;
+ con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len;
+ con->v2.out_sign_kvec_cnt++;
+}
+
+static void clear_out_sign_kvecs(struct ceph_connection *con)
+{
+ con->v2.out_sign_kvec_cnt = 0;
+}
+
+static bool con_secure(struct ceph_connection *con)
+{
+ return con->v2.con_mode == CEPH_CON_MODE_SECURE;
+}
+
+static int front_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.front_len);
+}
+
+static int middle_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.middle_len);
+}
+
+static int data_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.data_len);
+}
+
+static bool need_padding(int len)
+{
+ return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padded_len(int len)
+{
+ return ALIGN(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padding_len(int len)
+{
+ return padded_len(len) - len;
+}
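+
+/*
+ * A quick worked example, assuming the usual 16-byte GCM block size:
+ * for len = 13, padded_len() returns 16 and padding_len() returns 3;
+ * for len = 32, no padding is needed (padded_len() == 32,
+ * padding_len() == 0).
+ */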
+
+/* preamble + control segment */
+static int head_onwire_len(int ctrl_len, bool secure)
+{
+ int head_len;
+ int rem_len;
+
+ if (secure) {
+ head_len = CEPH_PREAMBLE_SECURE_LEN;
+ if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+ rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN;
+ }
+ } else {
+ head_len = CEPH_PREAMBLE_PLAIN_LEN;
+ if (ctrl_len)
+ head_len += ctrl_len + CEPH_CRC_LEN;
+ }
+ return head_len;
+}
+
+/* front, middle and data segments + epilogue */
+static int __tail_onwire_len(int front_len, int middle_len, int data_len,
+ bool secure)
+{
+ if (!front_len && !middle_len && !data_len)
+ return 0;
+
+ if (!secure)
+ return front_len + middle_len + data_len +
+ CEPH_EPILOGUE_PLAIN_LEN;
+
+ return padded_len(front_len) + padded_len(middle_len) +
+ padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN;
+}
+
+static int tail_onwire_len(const struct ceph_msg *msg, bool secure)
+{
+ return __tail_onwire_len(front_len(msg), middle_len(msg),
+ data_len(msg), secure);
+}
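+
+/*
+ * Rough worked examples for the two helpers above: in plain (crc) mode
+ * a non-empty control segment costs CEPH_PREAMBLE_PLAIN_LEN + ctrl_len
+ * + CEPH_CRC_LEN on the wire and the tail is simply front + middle +
+ * data plus the plain epilogue.  In secure mode each tail segment is
+ * rounded up to a GCM block with padded_len() and the epilogue grows
+ * to CEPH_EPILOGUE_SECURE_LEN, so e.g. a 10-byte front with no middle
+ * or data occupies padded_len(10) + CEPH_EPILOGUE_SECURE_LEN bytes.
+ */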
+
+/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */
+#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \
+ sizeof(struct ceph_msg_header2) + \
+ CEPH_CRC_LEN)
+
+static const int frame_aligns[] = {
+ sizeof(void *),
+ sizeof(void *),
+ sizeof(void *),
+ PAGE_SIZE
+};
+
+/*
+ * Discards trailing empty segments, unless there is just one segment.
+ * A frame always has at least one (possibly empty) segment.
+ */
+static int calc_segment_count(const int *lens, int len_cnt)
+{
+ int i;
+
+ for (i = len_cnt - 1; i >= 0; i--) {
+ if (lens[i])
+ return i + 1;
+ }
+
+ return 1;
+}
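+
+/*
+ * For example, lens = { hdr2, front, 0, 0 } yields 2 segments and
+ * lens = { hdr2, 0, 0, data } yields 4 (the empty front and middle
+ * count because a non-empty segment follows them), while an all-zero
+ * lens array collapses to the single mandatory segment.
+ */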
+
+static void init_frame_desc(struct ceph_frame_desc *desc, int tag,
+ const int *lens, int len_cnt)
+{
+ int i;
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->fd_tag = tag;
+ desc->fd_seg_cnt = calc_segment_count(lens, len_cnt);
+ BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT);
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ desc->fd_lens[i] = lens[i];
+ desc->fd_aligns[i] = frame_aligns[i];
+ }
+}
+
+/*
+ * Preamble crc covers everything up to itself (28 bytes) and
+ * is calculated and verified irrespective of the connection mode
+ * (i.e. even if the frame is encrypted).
+ */
+static void encode_preamble(const struct ceph_frame_desc *desc, void *p)
+{
+ void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+ void *start = p;
+ int i;
+
+ memset(p, 0, CEPH_PREAMBLE_LEN);
+
+ ceph_encode_8(&p, desc->fd_tag);
+ ceph_encode_8(&p, desc->fd_seg_cnt);
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ ceph_encode_32(&p, desc->fd_lens[i]);
+ ceph_encode_16(&p, desc->fd_aligns[i]);
+ }
+
+ put_unaligned_le32(crc32c(0, start, crcp - start), crcp);
+}
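+
+/*
+ * Resulting preamble layout (sizes per the comment above): a 1-byte
+ * tag, a 1-byte segment count, then fd_seg_cnt pairs of 4-byte length
+ * and 2-byte alignment, zero-filled up to the 28-byte mark and
+ * followed by the 4-byte crc32c of those 28 bytes.
+ */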
+
+static int decode_preamble(void *p, struct ceph_frame_desc *desc)
+{
+ void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+ u32 crc, expected_crc;
+ int i;
+
+ crc = crc32c(0, p, crcp - p);
+ expected_crc = get_unaligned_le32(crcp);
+ if (crc != expected_crc) {
+ pr_err("bad preamble crc, calculated %u, expected %u\n",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->fd_tag = ceph_decode_8(&p);
+ desc->fd_seg_cnt = ceph_decode_8(&p);
+ if (desc->fd_seg_cnt < 1 ||
+ desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) {
+ pr_err("bad segment count %d\n", desc->fd_seg_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ desc->fd_lens[i] = ceph_decode_32(&p);
+ desc->fd_aligns[i] = ceph_decode_16(&p);
+ }
+
+ /*
+ * This would fire for FRAME_TAG_WAIT (it has one empty
+	 * segment), but we should never get it as a client.
+ */
+ if (!desc->fd_lens[desc->fd_seg_cnt - 1]) {
+ pr_err("last segment empty\n");
+ return -EINVAL;
+ }
+
+ if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) {
+ pr_err("control segment too big %d\n", desc->fd_lens[0]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) {
+ pr_err("front segment too big %d\n", desc->fd_lens[1]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) {
+ pr_err("middle segment too big %d\n", desc->fd_lens[2]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) {
+ pr_err("data segment too big %d\n", desc->fd_lens[3]);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void encode_epilogue_plain(struct ceph_connection *con, bool aborted)
+{
+ con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
+ FRAME_LATE_STATUS_COMPLETE;
+ cpu_to_le32s(&con->v2.out_epil.front_crc);
+ cpu_to_le32s(&con->v2.out_epil.middle_crc);
+ cpu_to_le32s(&con->v2.out_epil.data_crc);
+}
+
+static void encode_epilogue_secure(struct ceph_connection *con, bool aborted)
+{
+ memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil));
+ con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
+ FRAME_LATE_STATUS_COMPLETE;
+}
+
+static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc,
+ u32 *data_crc)
+{
+ u8 late_status;
+
+ late_status = ceph_decode_8(&p);
+ if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) !=
+ FRAME_LATE_STATUS_COMPLETE) {
+		/* we should never get an aborted message as a client */
+ pr_err("bad late_status 0x%x\n", late_status);
+ return -EINVAL;
+ }
+
+ if (front_crc && middle_crc && data_crc) {
+ *front_crc = ceph_decode_32(&p);
+ *middle_crc = ceph_decode_32(&p);
+ *data_crc = ceph_decode_32(&p);
+ }
+
+ return 0;
+}
+
+static void fill_header(struct ceph_msg_header *hdr,
+ const struct ceph_msg_header2 *hdr2,
+ int front_len, int middle_len, int data_len,
+ const struct ceph_entity_name *peer_name)
+{
+ hdr->seq = hdr2->seq;
+ hdr->tid = hdr2->tid;
+ hdr->type = hdr2->type;
+ hdr->priority = hdr2->priority;
+ hdr->version = hdr2->version;
+ hdr->front_len = cpu_to_le32(front_len);
+ hdr->middle_len = cpu_to_le32(middle_len);
+ hdr->data_len = cpu_to_le32(data_len);
+ hdr->data_off = hdr2->data_off;
+ hdr->src = *peer_name;
+ hdr->compat_version = hdr2->compat_version;
+ hdr->reserved = 0;
+ hdr->crc = 0;
+}
+
+static void fill_header2(struct ceph_msg_header2 *hdr2,
+ const struct ceph_msg_header *hdr, u64 ack_seq)
+{
+ hdr2->seq = hdr->seq;
+ hdr2->tid = hdr->tid;
+ hdr2->type = hdr->type;
+ hdr2->priority = hdr->priority;
+ hdr2->version = hdr->version;
+ hdr2->data_pre_padding_len = 0;
+ hdr2->data_off = hdr->data_off;
+ hdr2->ack_seq = cpu_to_le64(ack_seq);
+ hdr2->flags = 0;
+ hdr2->compat_version = hdr->compat_version;
+ hdr2->reserved = 0;
+}
+
+static int verify_control_crc(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ u32 crc, expected_crc;
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len);
+ WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN);
+
+ crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len);
+ expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base);
+ if (crc != expected_crc) {
+ pr_err("bad control crc, calculated %u, expected %u\n",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc,
+ u32 middle_crc, u32 data_crc)
+{
+ if (front_len(con->in_msg)) {
+ con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base,
+ front_len(con->in_msg));
+ } else {
+ WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg));
+ con->in_front_crc = -1;
+ }
+
+ if (middle_len(con->in_msg))
+ con->in_middle_crc = crc32c(-1,
+ con->in_msg->middle->vec.iov_base,
+ middle_len(con->in_msg));
+ else if (data_len(con->in_msg))
+ con->in_middle_crc = -1;
+ else
+ con->in_middle_crc = 0;
+
+ if (!data_len(con->in_msg))
+ con->in_data_crc = 0;
+
+ dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg,
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+
+ if (con->in_front_crc != front_crc) {
+ pr_err("bad front crc, calculated %u, expected %u\n",
+ con->in_front_crc, front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != middle_crc) {
+ pr_err("bad middle crc, calculated %u, expected %u\n",
+ con->in_middle_crc, middle_crc);
+ return -EBADMSG;
+ }
+ if (con->in_data_crc != data_crc) {
+ pr_err("bad data crc, calculated %u, expected %u\n",
+ con->in_data_crc, data_crc);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+static int setup_crypto(struct ceph_connection *con,
+ const u8 *session_key, int session_key_len,
+ const u8 *con_secret, int con_secret_len)
+{
+ unsigned int noio_flag;
+ int ret;
+
+ dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n",
+ __func__, con, con->v2.con_mode, session_key_len, con_secret_len);
+ WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req);
+
+ if (con->v2.con_mode != CEPH_CON_MODE_CRC &&
+ con->v2.con_mode != CEPH_CON_MODE_SECURE) {
+ pr_err("bad con_mode %d\n", con->v2.con_mode);
+ return -EINVAL;
+ }
+
+ if (!session_key_len) {
+ WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC);
+ WARN_ON(con_secret_len);
+ return 0; /* auth_none */
+ }
+
+ noio_flag = memalloc_noio_save();
+ con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
+ memalloc_noio_restore(noio_flag);
+ if (IS_ERR(con->v2.hmac_tfm)) {
+ ret = PTR_ERR(con->v2.hmac_tfm);
+ con->v2.hmac_tfm = NULL;
+ pr_err("failed to allocate hmac tfm context: %d\n", ret);
+ return ret;
+ }
+
+ WARN_ON((unsigned long)session_key &
+ crypto_shash_alignmask(con->v2.hmac_tfm));
+ ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key,
+ session_key_len);
+ if (ret) {
+ pr_err("failed to set hmac key: %d\n", ret);
+ return ret;
+ }
+
+ if (con->v2.con_mode == CEPH_CON_MODE_CRC) {
+ WARN_ON(con_secret_len);
+ return 0; /* auth_x, plain mode */
+ }
+
+ if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) {
+ pr_err("con_secret too small %d\n", con_secret_len);
+ return -EINVAL;
+ }
+
+ noio_flag = memalloc_noio_save();
+ con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ memalloc_noio_restore(noio_flag);
+ if (IS_ERR(con->v2.gcm_tfm)) {
+ ret = PTR_ERR(con->v2.gcm_tfm);
+ con->v2.gcm_tfm = NULL;
+ pr_err("failed to allocate gcm tfm context: %d\n", ret);
+ return ret;
+ }
+
+ WARN_ON((unsigned long)con_secret &
+ crypto_aead_alignmask(con->v2.gcm_tfm));
+ ret = crypto_aead_setkey(con->v2.gcm_tfm, con_secret, CEPH_GCM_KEY_LEN);
+ if (ret) {
+ pr_err("failed to set gcm key: %d\n", ret);
+ return ret;
+ }
+
+ WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN);
+ ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN);
+ if (ret) {
+ pr_err("failed to set gcm tag size: %d\n", ret);
+ return ret;
+ }
+
+ con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO);
+ if (!con->v2.gcm_req) {
+ pr_err("failed to allocate gcm request\n");
+ return -ENOMEM;
+ }
+
+ crypto_init_wait(&con->v2.gcm_wait);
+ aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &con->v2.gcm_wait);
+
+ memcpy(&con->v2.in_gcm_nonce, con_secret + CEPH_GCM_KEY_LEN,
+ CEPH_GCM_IV_LEN);
+ memcpy(&con->v2.out_gcm_nonce,
+ con_secret + CEPH_GCM_KEY_LEN + CEPH_GCM_IV_LEN,
+ CEPH_GCM_IV_LEN);
+ return 0; /* auth_x, secure mode */
+}
+
+static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs,
+ int kvec_cnt, u8 *hmac)
+{
+ SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */
+ int ret;
+ int i;
+
+ dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con,
+ con->v2.hmac_tfm, kvec_cnt);
+
+ if (!con->v2.hmac_tfm) {
+ memset(hmac, 0, SHA256_DIGEST_SIZE);
+ return 0; /* auth_none */
+ }
+
+ desc->tfm = con->v2.hmac_tfm;
+ ret = crypto_shash_init(desc);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < kvec_cnt; i++) {
+ WARN_ON((unsigned long)kvecs[i].iov_base &
+ crypto_shash_alignmask(con->v2.hmac_tfm));
+ ret = crypto_shash_update(desc, kvecs[i].iov_base,
+ kvecs[i].iov_len);
+ if (ret)
+ goto out;
+ }
+
+ ret = crypto_shash_final(desc, hmac);
+
+out:
+ shash_desc_zero(desc);
+ return ret; /* auth_x, both plain and secure modes */
+}
+
+static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce)
+{
+ u64 counter;
+
+ counter = le64_to_cpu(nonce->counter);
+ nonce->counter = cpu_to_le64(counter + 1);
+}
+
+static int gcm_crypt(struct ceph_connection *con, bool encrypt,
+ struct scatterlist *src, struct scatterlist *dst,
+ int src_len)
+{
+ struct ceph_gcm_nonce *nonce;
+ int ret;
+
+ nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce;
+
+ aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */
+ aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce);
+ ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) :
+ crypto_aead_decrypt(con->v2.gcm_req),
+ &con->v2.gcm_wait);
+ if (ret)
+ return ret;
+
+ gcm_inc_nonce(nonce);
+ return 0;
+}
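+
+/*
+ * Note on the nonce discipline: each direction keeps its own nonce
+ * (in_gcm_nonce/out_gcm_nonce, seeded from con_secret in
+ * setup_crypto()) and the counter portion is bumped after every
+ * successful AEAD operation, so both peers stay in sync without ever
+ * putting IVs on the wire.
+ */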
+
+static void get_bvec_at(struct ceph_msg_data_cursor *cursor,
+ struct bio_vec *bv)
+{
+ struct page *page;
+ size_t off, len;
+
+ WARN_ON(!cursor->total_resid);
+
+ /* skip zero-length data items */
+ while (!cursor->resid)
+ ceph_msg_data_advance(cursor, 0);
+
+ /* get a piece of data, cursor isn't advanced */
+ page = ceph_msg_data_next(cursor, &off, &len, NULL);
+
+ bv->bv_page = page;
+ bv->bv_offset = off;
+ bv->bv_len = len;
+}
+
+static int calc_sg_cnt(void *buf, int buf_len)
+{
+ int sg_cnt;
+
+ if (!buf_len)
+ return 0;
+
+ sg_cnt = need_padding(buf_len) ? 1 : 0;
+ if (is_vmalloc_addr(buf)) {
+ WARN_ON(offset_in_page(buf));
+ sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT;
+ } else {
+ sg_cnt++;
+ }
+
+ return sg_cnt;
+}
+
+static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor)
+{
+ int data_len = cursor->total_resid;
+ struct bio_vec bv;
+ int sg_cnt;
+
+ if (!data_len)
+ return 0;
+
+ sg_cnt = need_padding(data_len) ? 1 : 0;
+ do {
+ get_bvec_at(cursor, &bv);
+ sg_cnt++;
+
+ ceph_msg_data_advance(cursor, bv.bv_len);
+ } while (cursor->total_resid);
+
+ return sg_cnt;
+}
+
+static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad)
+{
+ void *end = buf + buf_len;
+ struct page *page;
+ int len;
+ void *p;
+
+ if (!buf_len)
+ return;
+
+ if (is_vmalloc_addr(buf)) {
+ p = buf;
+ do {
+ page = vmalloc_to_page(p);
+ len = min_t(int, end - p, PAGE_SIZE);
+ WARN_ON(!page || !len || offset_in_page(p));
+ sg_set_page(*sg, page, len, 0);
+ *sg = sg_next(*sg);
+ p += len;
+ } while (p != end);
+ } else {
+ sg_set_buf(*sg, buf, buf_len);
+ *sg = sg_next(*sg);
+ }
+
+ if (need_padding(buf_len)) {
+ sg_set_buf(*sg, pad, padding_len(buf_len));
+ *sg = sg_next(*sg);
+ }
+}
+
+static void init_sgs_cursor(struct scatterlist **sg,
+ struct ceph_msg_data_cursor *cursor, u8 *pad)
+{
+ int data_len = cursor->total_resid;
+ struct bio_vec bv;
+
+ if (!data_len)
+ return;
+
+ do {
+ get_bvec_at(cursor, &bv);
+ sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
+ *sg = sg_next(*sg);
+
+ ceph_msg_data_advance(cursor, bv.bv_len);
+ } while (cursor->total_resid);
+
+ if (need_padding(data_len)) {
+ sg_set_buf(*sg, pad, padding_len(data_len));
+ *sg = sg_next(*sg);
+ }
+}
+
+static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg,
+ u8 *front_pad, u8 *middle_pad, u8 *data_pad,
+ void *epilogue, bool add_tag)
+{
+ struct ceph_msg_data_cursor cursor;
+ struct scatterlist *cur_sg;
+ int sg_cnt;
+ int ret;
+
+ if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
+ return 0;
+
+ sg_cnt = 1; /* epilogue + [auth tag] */
+ if (front_len(msg))
+ sg_cnt += calc_sg_cnt(msg->front.iov_base,
+ front_len(msg));
+ if (middle_len(msg))
+ sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base,
+ middle_len(msg));
+ if (data_len(msg)) {
+ ceph_msg_data_cursor_init(&cursor, msg, data_len(msg));
+ sg_cnt += calc_sg_cnt_cursor(&cursor);
+ }
+
+ ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO);
+ if (ret)
+ return ret;
+
+ cur_sg = sgt->sgl;
+ if (front_len(msg))
+ init_sgs(&cur_sg, msg->front.iov_base, front_len(msg),
+ front_pad);
+ if (middle_len(msg))
+ init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg),
+ middle_pad);
+ if (data_len(msg)) {
+ ceph_msg_data_cursor_init(&cursor, msg, data_len(msg));
+ init_sgs_cursor(&cur_sg, &cursor, data_pad);
+ }
+
+ WARN_ON(!sg_is_last(cur_sg));
+ sg_set_buf(cur_sg, epilogue,
+ CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0));
+ return 0;
+}
+
+static int decrypt_preamble(struct ceph_connection *con)
+{
+ struct scatterlist sg;
+
+ sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN);
+ return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN);
+}
+
+static int decrypt_control_remainder(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN;
+ struct scatterlist sgs[2];
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len);
+ WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len);
+
+ sg_init_table(sgs, 2);
+ sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len);
+ sg_set_buf(&sgs[1], con->v2.in_buf, pt_len);
+
+ return gcm_crypt(con, false, sgs, sgs,
+ padded_len(rem_len) + CEPH_GCM_TAG_LEN);
+}
+
+static int decrypt_message(struct ceph_connection *con)
+{
+ struct sg_table sgt = {};
+ int ret;
+
+ ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf),
+ MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf),
+ con->v2.in_buf, true);
+ if (ret)
+ goto out;
+
+ ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl,
+ tail_onwire_len(con->in_msg, true));
+
+out:
+ sg_free_table(&sgt);
+ return ret;
+}
+
+static int prepare_banner(struct ceph_connection *con)
+{
+ int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8;
+ void *buf, *p;
+
+ buf = alloc_conn_buf(con, buf_len);
+ if (!buf)
+ return -ENOMEM;
+
+ p = buf;
+ ceph_encode_copy(&p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN);
+ ceph_encode_16(&p, sizeof(u64) + sizeof(u64));
+ ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES);
+ ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES);
+ WARN_ON(p != buf + buf_len);
+
+ add_out_kvec(con, buf, buf_len);
+ add_out_sign_kvec(con, buf, buf_len);
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+/*
+ * base:
+ * preamble
+ * control body (ctrl_len bytes)
+ * space for control crc
+ *
+ * extdata (optional):
+ * control body (extdata_len bytes)
+ *
+ * Compute control crc and gather base and extdata into:
+ *
+ * preamble
+ * control body (ctrl_len + extdata_len bytes)
+ * control crc
+ *
+ * Preamble should already be encoded at the start of base.
+ */
+static void prepare_head_plain(struct ceph_connection *con, void *base,
+ int ctrl_len, void *extdata, int extdata_len,
+ bool to_be_signed)
+{
+ int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN;
+ void *crcp = base + base_len - CEPH_CRC_LEN;
+ u32 crc;
+
+ crc = crc32c(-1, CTRL_BODY(base), ctrl_len);
+ if (extdata_len)
+ crc = crc32c(crc, extdata, extdata_len);
+ put_unaligned_le32(crc, crcp);
+
+ if (!extdata_len) {
+ add_out_kvec(con, base, base_len);
+ if (to_be_signed)
+ add_out_sign_kvec(con, base, base_len);
+ return;
+ }
+
+ add_out_kvec(con, base, crcp - base);
+ add_out_kvec(con, extdata, extdata_len);
+ add_out_kvec(con, crcp, CEPH_CRC_LEN);
+ if (to_be_signed) {
+ add_out_sign_kvec(con, base, crcp - base);
+ add_out_sign_kvec(con, extdata, extdata_len);
+ add_out_sign_kvec(con, crcp, CEPH_CRC_LEN);
+ }
+}
+
+static int prepare_head_secure_small(struct ceph_connection *con,
+ void *base, int ctrl_len)
+{
+ struct scatterlist sg;
+ int ret;
+
+ /* inline buffer padding? */
+ if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN)
+ memset(CTRL_BODY(base) + ctrl_len, 0,
+ CEPH_PREAMBLE_INLINE_LEN - ctrl_len);
+
+ sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN);
+ ret = gcm_crypt(con, true, &sg, &sg,
+ CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN);
+ if (ret)
+ return ret;
+
+ add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN);
+ return 0;
+}
+
+/*
+ * base:
+ * preamble
+ * control body (ctrl_len bytes)
+ * space for padding, if needed
+ * space for control remainder auth tag
+ * space for preamble auth tag
+ *
+ * Encrypt preamble and the inline portion, then encrypt the remainder
+ * and gather into:
+ *
+ * preamble
+ * control body (48 bytes)
+ * preamble auth tag
+ * control body (ctrl_len - 48 bytes)
+ * zero padding, if needed
+ * control remainder auth tag
+ *
+ * Preamble should already be encoded at the start of base.
+ */
+static int prepare_head_secure_big(struct ceph_connection *con,
+ void *base, int ctrl_len)
+{
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN;
+ void *rem_tag = rem + padded_len(rem_len);
+ void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN;
+ struct scatterlist sgs[2];
+ int ret;
+
+ sg_init_table(sgs, 2);
+ sg_set_buf(&sgs[0], base, rem - base);
+ sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN);
+ ret = gcm_crypt(con, true, sgs, sgs, rem - base);
+ if (ret)
+ return ret;
+
+ /* control remainder padding? */
+ if (need_padding(rem_len))
+ memset(rem + rem_len, 0, padding_len(rem_len));
+
+ sg_init_one(&sgs[0], rem, pmbl_tag - rem);
+ ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem);
+ if (ret)
+ return ret;
+
+ add_out_kvec(con, base, rem - base);
+ add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN);
+ add_out_kvec(con, rem, pmbl_tag - rem);
+ return 0;
+}
+
+static int __prepare_control(struct ceph_connection *con, int tag,
+ void *base, int ctrl_len, void *extdata,
+ int extdata_len, bool to_be_signed)
+{
+ int total_len = ctrl_len + extdata_len;
+ struct ceph_frame_desc desc;
+ int ret;
+
+ dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag,
+ total_len, ctrl_len, extdata_len);
+
+ /* extdata may be vmalloc'ed but not base */
+ if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len))
+ return -EINVAL;
+
+ init_frame_desc(&desc, tag, &total_len, 1);
+ encode_preamble(&desc, base);
+
+ if (con_secure(con)) {
+ if (WARN_ON(extdata_len || to_be_signed))
+ return -EINVAL;
+
+ if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN)
+ /* fully inlined, inline buffer may need padding */
+ ret = prepare_head_secure_small(con, base, ctrl_len);
+ else
+ /* partially inlined, inline buffer is full */
+ ret = prepare_head_secure_big(con, base, ctrl_len);
+ if (ret)
+ return ret;
+ } else {
+ prepare_head_plain(con, base, ctrl_len, extdata, extdata_len,
+ to_be_signed);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+static int prepare_control(struct ceph_connection *con, int tag,
+ void *base, int ctrl_len)
+{
+ return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false);
+}
+
+static int prepare_hello(struct ceph_connection *con)
+{
+ void *buf, *p;
+ int ctrl_len;
+
+ ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr);
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT);
+ ceph_encode_entity_addr(&p, &con->peer_addr);
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len,
+ NULL, 0, true);
+}
+
+/* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */
+#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN)
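+
+/*
+ * The arithmetic follows from head_onwire_len() in plain mode:
+ * CEPH_PREAMBLE_PLAIN_LEN + AUTH_BUF_LEN + CEPH_CRC_LEN == 512, so the
+ * whole auth request head fits in a 512-byte conn buffer.
+ */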
+
+static int prepare_auth_request(struct ceph_connection *con)
+{
+ void *authorizer, *authorizer_copy;
+ int ctrl_len, authorizer_len;
+ void *buf;
+ int ret;
+
+ ctrl_len = AUTH_BUF_LEN;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len,
+ &authorizer, &authorizer_len);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_HELLO) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p get_auth_request ret %d\n", __func__, con, ret);
+ if (ret)
+ return ret;
+
+ authorizer_copy = alloc_conn_buf(con, authorizer_len);
+ if (!authorizer_copy)
+ return -ENOMEM;
+
+ memcpy(authorizer_copy, authorizer, authorizer_len);
+
+ return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len,
+ authorizer_copy, authorizer_len, true);
+}
+
+static int prepare_auth_request_more(struct ceph_connection *con,
+ void *reply, int reply_len)
+{
+ int ctrl_len, authorizer_len;
+ void *authorizer;
+ void *buf;
+ int ret;
+
+ ctrl_len = AUTH_BUF_LEN;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->handle_auth_reply_more(con, reply, reply_len,
+ CTRL_BODY(buf), &ctrl_len,
+ &authorizer, &authorizer_len);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret);
+ if (ret)
+ return ret;
+
+ return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf,
+ ctrl_len, authorizer, authorizer_len, true);
+}
+
+static int prepare_auth_signature(struct ceph_connection *con)
+{
+ void *buf;
+ int ret;
+
+ buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE,
+ con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt,
+ CTRL_BODY(buf));
+ if (ret)
+ return ret;
+
+ return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf,
+ SHA256_DIGEST_SIZE);
+}
+
+static int prepare_client_ident(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 global_id = ceph_client_gid(client);
+ void *buf, *p;
+ int ctrl_len;
+
+ WARN_ON(con->v2.server_cookie);
+ WARN_ON(con->v2.connect_seq);
+ WARN_ON(con->v2.peer_global_seq);
+
+ if (!con->v2.client_cookie) {
+ do {
+ get_random_bytes(&con->v2.client_cookie,
+ sizeof(con->v2.client_cookie));
+ } while (!con->v2.client_cookie);
+ dout("%s con %p generated cookie 0x%llx\n", __func__, con,
+ con->v2.client_cookie);
+ } else {
+ dout("%s con %p cookie already set 0x%llx\n", __func__, con,
+ con->v2.client_cookie);
+ }
+
+ dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n",
+ __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+ ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce),
+ global_id, con->v2.global_seq, client->supported_features,
+ client->required_features, con->v2.client_cookie);
+
+ ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) +
+ ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, 2); /* addrvec marker */
+ ceph_encode_32(&p, 1); /* addr_cnt */
+ ceph_encode_entity_addr(&p, my_addr);
+ ceph_encode_entity_addr(&p, &con->peer_addr);
+ ceph_encode_64(&p, global_id);
+ ceph_encode_64(&p, con->v2.global_seq);
+ ceph_encode_64(&p, client->supported_features);
+ ceph_encode_64(&p, client->required_features);
+ ceph_encode_64(&p, 0); /* flags */
+ ceph_encode_64(&p, con->v2.client_cookie);
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len);
+}
+
+static int prepare_session_reconnect(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ void *buf, *p;
+ int ctrl_len;
+
+ WARN_ON(!con->v2.client_cookie);
+ WARN_ON(!con->v2.server_cookie);
+ WARN_ON(!con->v2.connect_seq);
+ WARN_ON(!con->v2.peer_global_seq);
+
+ dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n",
+ __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+ con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq,
+ con->v2.connect_seq, con->in_seq);
+
+ ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, 2); /* entity_addrvec_t marker */
+ ceph_encode_32(&p, 1); /* my_addrs len */
+ ceph_encode_entity_addr(&p, my_addr);
+ ceph_encode_64(&p, con->v2.client_cookie);
+ ceph_encode_64(&p, con->v2.server_cookie);
+ ceph_encode_64(&p, con->v2.global_seq);
+ ceph_encode_64(&p, con->v2.connect_seq);
+ ceph_encode_64(&p, con->in_seq);
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len);
+}
+
+static int prepare_keepalive2(struct ceph_connection *con)
+{
+ struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf);
+ struct timespec64 now;
+
+ ktime_get_real_ts64(&now);
+ dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec,
+ now.tv_nsec);
+
+ ceph_encode_timespec64(ts, &now);
+
+ reset_out_kvecs(con);
+ return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf,
+ sizeof(struct ceph_timespec));
+}
+
+static int prepare_ack(struct ceph_connection *con)
+{
+ void *p;
+
+ dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ p = CTRL_BODY(con->v2.out_buf);
+ ceph_encode_64(&p, con->in_seq_acked);
+
+ reset_out_kvecs(con);
+ return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8);
+}
+
+static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted)
+{
+ dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con,
+ con->out_msg, aborted, con->v2.out_epil.front_crc,
+ con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc);
+
+ encode_epilogue_plain(con, aborted);
+ add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN);
+}
+
+/*
+ * For "used" empty segments, crc is -1. For unused (trailing)
+ * segments, crc is 0.
+ */
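+/*
+ * Concretely: a front-only message ends up with front_crc computed
+ * over the front, middle_crc == 0 and data_crc == 0, whereas a
+ * data-only message gets front_crc == -1 and middle_crc == -1, with
+ * data_crc computed later as the data itself is queued.
+ */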
+static void prepare_message_plain(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+
+ prepare_head_plain(con, con->v2.out_buf,
+ sizeof(struct ceph_msg_header2), NULL, 0, false);
+
+ if (!front_len(msg) && !middle_len(msg)) {
+ if (!data_len(msg)) {
+ /*
+ * Empty message: once the head is written,
+ * we are done -- there is no epilogue.
+ */
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ return;
+ }
+
+ con->v2.out_epil.front_crc = -1;
+ con->v2.out_epil.middle_crc = -1;
+ con->v2.out_state = OUT_S_QUEUE_DATA;
+ return;
+ }
+
+ if (front_len(msg)) {
+ con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base,
+ front_len(msg));
+ add_out_kvec(con, msg->front.iov_base, front_len(msg));
+ } else {
+ /* middle (at least) is there, checked above */
+ con->v2.out_epil.front_crc = -1;
+ }
+
+ if (middle_len(msg)) {
+ con->v2.out_epil.middle_crc =
+ crc32c(-1, msg->middle->vec.iov_base, middle_len(msg));
+ add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+ } else {
+ con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0;
+ }
+
+ if (data_len(msg)) {
+ con->v2.out_state = OUT_S_QUEUE_DATA;
+ } else {
+ con->v2.out_epil.data_crc = 0;
+ prepare_epilogue_plain(con, false);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ }
+}
+
+/*
+ * Unfortunately the kernel crypto API doesn't support streaming
+ * (piecewise) operation for AEAD algorithms, so we can't get away
+ * with a fixed size buffer and a couple sgs. Instead, we have to
+ * allocate pages for the entire tail of the message (currently up
+ * to ~32M) and two sgs arrays (up to ~256K each)...
+ */
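+/*
+ * Back-of-the-envelope numbers for the sizes quoted above, assuming 4K
+ * pages and roughly 32 bytes per scatterlist entry: a ~32M tail needs
+ * ~8192 encrypted pages and about as many sg entries per array, i.e.
+ * on the order of 256K for each of the two sg tables.
+ */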
+static int prepare_message_secure(struct ceph_connection *con)
+{
+ void *zerop = page_address(ceph_zero_page);
+ struct sg_table enc_sgt = {};
+ struct sg_table sgt = {};
+ struct page **enc_pages;
+ int enc_page_cnt;
+ int tail_len;
+ int ret;
+
+ ret = prepare_head_secure_small(con, con->v2.out_buf,
+ sizeof(struct ceph_msg_header2));
+ if (ret)
+ return ret;
+
+ tail_len = tail_onwire_len(con->out_msg, true);
+ if (!tail_len) {
+ /*
+ * Empty message: once the head is written,
+ * we are done -- there is no epilogue.
+ */
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ return 0;
+ }
+
+ encode_epilogue_secure(con, false);
+ ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop,
+ &con->v2.out_epil, false);
+ if (ret)
+ goto out;
+
+ enc_page_cnt = calc_pages_for(0, tail_len);
+ enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO);
+ if (IS_ERR(enc_pages)) {
+ ret = PTR_ERR(enc_pages);
+ goto out;
+ }
+
+ WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = enc_pages;
+ con->v2.out_enc_page_cnt = enc_page_cnt;
+ con->v2.out_enc_resid = tail_len;
+ con->v2.out_enc_i = 0;
+
+ ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt,
+ 0, tail_len, GFP_NOIO);
+ if (ret)
+ goto out;
+
+ ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl,
+ tail_len - CEPH_GCM_TAG_LEN);
+ if (ret)
+ goto out;
+
+ dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con,
+ con->out_msg, sgt.orig_nents, enc_page_cnt);
+ con->v2.out_state = OUT_S_QUEUE_ENC_PAGE;
+
+out:
+ sg_free_table(&sgt);
+ sg_free_table(&enc_sgt);
+ return ret;
+}
+
+static int prepare_message(struct ceph_connection *con)
+{
+ int lens[] = {
+ sizeof(struct ceph_msg_header2),
+ front_len(con->out_msg),
+ middle_len(con->out_msg),
+ data_len(con->out_msg)
+ };
+ struct ceph_frame_desc desc;
+ int ret;
+
+ dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con,
+ con->out_msg, lens[0], lens[1], lens[2], lens[3]);
+
+ if (con->in_seq > con->in_seq_acked) {
+ dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+ }
+
+ reset_out_kvecs(con);
+ init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4);
+ encode_preamble(&desc, con->v2.out_buf);
+ fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr,
+ con->in_seq_acked);
+
+ if (con_secure(con)) {
+ ret = prepare_message_secure(con);
+ if (ret)
+ return ret;
+ } else {
+ prepare_message_plain(con);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+static int prepare_read_banner_prefix(struct ceph_connection *con)
+{
+ void *buf;
+
+ buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN);
+ if (!buf)
+ return -ENOMEM;
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
+ add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
+ con->state = CEPH_CON_S_V2_BANNER_PREFIX;
+ return 0;
+}
+
+static int prepare_read_banner_payload(struct ceph_connection *con,
+ int payload_len)
+{
+ void *buf;
+
+ buf = alloc_conn_buf(con, payload_len);
+ if (!buf)
+ return -ENOMEM;
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf, payload_len);
+ add_in_sign_kvec(con, buf, payload_len);
+ con->state = CEPH_CON_S_V2_BANNER_PAYLOAD;
+ return 0;
+}
+
+static void prepare_read_preamble(struct ceph_connection *con)
+{
+ reset_in_kvecs(con);
+ add_in_kvec(con, con->v2.in_buf,
+ con_secure(con) ? CEPH_PREAMBLE_SECURE_LEN :
+ CEPH_PREAMBLE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_PREAMBLE;
+}
+
+static int prepare_read_control(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int head_len;
+ void *buf;
+
+ reset_in_kvecs(con);
+ if (con->state == CEPH_CON_S_V2_HELLO ||
+ con->state == CEPH_CON_S_V2_AUTH) {
+ head_len = head_onwire_len(ctrl_len, false);
+ buf = alloc_conn_buf(con, head_len);
+ if (!buf)
+ return -ENOMEM;
+
+ /* preserve preamble */
+ memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN);
+
+ add_in_kvec(con, CTRL_BODY(buf), ctrl_len);
+ add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN);
+ add_in_sign_kvec(con, buf, head_len);
+ } else {
+ if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ add_in_kvec(con, buf, ctrl_len);
+ } else {
+ add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len);
+ }
+ add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN);
+ }
+ con->v2.in_state = IN_S_HANDLE_CONTROL;
+ return 0;
+}
+
+static int prepare_read_control_remainder(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ void *buf;
+
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN);
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len);
+ add_in_kvec(con, con->v2.in_buf,
+ padding_len(rem_len) + CEPH_GCM_TAG_LEN);
+ con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER;
+ return 0;
+}
+
+static void prepare_read_data(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ if (!con_secure(con))
+ con->in_data_crc = -1;
+ ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg,
+ data_len(con->in_msg));
+
+ get_bvec_at(&con->v2.in_cursor, &bv);
+ set_in_bvec(con, &bv);
+ con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT;
+}
+
+static void prepare_read_data_cont(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ if (!con_secure(con))
+ con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
+ con->v2.in_bvec.bv_page,
+ con->v2.in_bvec.bv_offset,
+ con->v2.in_bvec.bv_len);
+
+ ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len);
+ if (con->v2.in_cursor.total_resid) {
+ get_bvec_at(&con->v2.in_cursor, &bv);
+ set_in_bvec(con, &bv);
+ WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT);
+ return;
+ }
+
+ /*
+ * We've read all data. Prepare to read data padding (if any)
+ * and epilogue.
+ */
+ reset_in_kvecs(con);
+ if (con_secure(con)) {
+ if (need_padding(data_len(con->in_msg)))
+ add_in_kvec(con, DATA_PAD(con->v2.in_buf),
+ padding_len(data_len(con->in_msg)));
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN);
+ } else {
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+ }
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+}
+
+static void __finish_skip(struct ceph_connection *con)
+{
+ con->in_seq++;
+ prepare_read_preamble(con);
+}
+
+static void prepare_skip_message(struct ceph_connection *con)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ int tail_len;
+
+ dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1],
+ desc->fd_lens[2], desc->fd_lens[3]);
+
+ tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2],
+ desc->fd_lens[3], con_secure(con));
+ if (!tail_len) {
+ __finish_skip(con);
+ } else {
+ set_in_skip(con, tail_len);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+ }
+}
+
+static int process_banner_prefix(struct ceph_connection *con)
+{
+ int payload_len;
+ void *p;
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN);
+
+ p = con->v2.in_kvecs[0].iov_base;
+ if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) {
+ if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN))
+ con->error_msg = "server is speaking msgr1 protocol";
+ else
+ con->error_msg = "protocol error, bad banner";
+ return -EINVAL;
+ }
+
+ p += CEPH_BANNER_V2_LEN;
+ payload_len = ceph_decode_16(&p);
+ dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+ return prepare_read_banner_payload(con, payload_len);
+}
+
+static int process_banner_payload(struct ceph_connection *con)
+{
+ void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len;
+ u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES;
+ u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES;
+ u64 server_feat, server_req_feat;
+ void *p;
+ int ret;
+
+ p = con->v2.in_kvecs[0].iov_base;
+ ceph_decode_64_safe(&p, end, server_feat, bad);
+ ceph_decode_64_safe(&p, end, server_req_feat, bad);
+
+ dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n",
+ __func__, con, server_feat, server_req_feat);
+
+ if (req_feat & ~server_feat) {
+ pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+ server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+ if (server_req_feat & ~feat) {
+ pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+ feat, server_req_feat & ~feat);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+
+ /* no reset_out_kvecs() as our banner may still be pending */
+ ret = prepare_hello(con);
+ if (ret) {
+ pr_err("prepare_hello failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_HELLO;
+ prepare_read_preamble(con);
+ return 0;
+
+bad:
+ pr_err("failed to decode banner payload\n");
+ return -EINVAL;
+}
+
+static int process_hello(struct ceph_connection *con, void *p, void *end)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ struct ceph_entity_addr addr_for_me;
+ u8 entity_type;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_HELLO) {
+ con->error_msg = "protocol error, unexpected hello";
+ return -EINVAL;
+ }
+
+ ceph_decode_8_safe(&p, end, entity_type, bad);
+ ret = ceph_decode_entity_addr(&p, end, &addr_for_me);
+ if (ret) {
+ pr_err("failed to decode addr_for_me: %d\n", ret);
+ return ret;
+ }
+
+ dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con,
+ entity_type, ceph_pr_addr(&addr_for_me));
+
+ if (entity_type != con->peer_name.type) {
+ pr_err("bad peer type, want %d, got %d\n",
+ con->peer_name.type, entity_type);
+ con->error_msg = "wrong peer at address";
+ return -EINVAL;
+ }
+
+ /*
+ * Set our address to the address our first peer (i.e. monitor)
+ * sees us connecting from. If we are behind some sort of NAT
+ * and want to be identified by some private (not NATed) address,
+ * the ip option should be used.
+ */
+ if (ceph_addr_is_blank(my_addr)) {
+ memcpy(&my_addr->in_addr, &addr_for_me.in_addr,
+ sizeof(my_addr->in_addr));
+ ceph_addr_set_port(my_addr, 0);
+ dout("%s con %p set my addr %s, as seen by peer %s\n",
+ __func__, con, ceph_pr_addr(my_addr),
+ ceph_pr_addr(&con->peer_addr));
+ } else {
+ dout("%s con %p my addr already set %s\n",
+ __func__, con, ceph_pr_addr(my_addr));
+ }
+
+ WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr));
+ WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY);
+ WARN_ON(!my_addr->nonce);
+
+ /* no reset_out_kvecs() as our hello may still be pending */
+ ret = prepare_auth_request(con);
+ if (ret) {
+ if (ret != -EAGAIN)
+ pr_err("prepare_auth_request failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_AUTH;
+ return 0;
+
+bad:
+ pr_err("failed to decode hello\n");
+ return -EINVAL;
+}
+
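+/*
+ * The server rejected our auth method: decode the protocols and
+ * connection modes it allows and let the auth layer pick again.
+ */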
+static int process_auth_bad_method(struct ceph_connection *con,
+ void *p, void *end)
+{
+ int allowed_protos[8], allowed_modes[8];
+ int allowed_proto_cnt, allowed_mode_cnt;
+ int used_proto, result;
+ int ret;
+ int i;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_bad_method";
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(&p, end, used_proto, bad);
+ ceph_decode_32_safe(&p, end, result, bad);
+ dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto,
+ result);
+
+ ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad);
+ if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) {
+ pr_err("allowed_protos too big %d\n", allowed_proto_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < allowed_proto_cnt; i++) {
+ ceph_decode_32_safe(&p, end, allowed_protos[i], bad);
+ dout("%s con %p allowed_protos[%d] %d\n", __func__, con,
+ i, allowed_protos[i]);
+ }
+
+ ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad);
+ if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) {
+ pr_err("allowed_modes too big %d\n", allowed_mode_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < allowed_mode_cnt; i++) {
+ ceph_decode_32_safe(&p, end, allowed_modes[i], bad);
+ dout("%s con %p allowed_modes[%d] %d\n", __func__, con,
+ i, allowed_modes[i]);
+ }
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->handle_auth_bad_method(con, used_proto, result,
+ allowed_protos,
+ allowed_proto_cnt,
+ allowed_modes,
+ allowed_mode_cnt);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret);
+ return ret;
+
+bad:
+ pr_err("failed to decode auth_bad_method\n");
+ return -EINVAL;
+}
+
+static int process_auth_reply_more(struct ceph_connection *con,
+ void *p, void *end)
+{
+ int payload_len;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_reply_more";
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+ ceph_decode_need(&p, end, payload_len, bad);
+
+ dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+ reset_out_kvecs(con);
+ ret = prepare_auth_request_more(con, p, payload_len);
+ if (ret) {
+ if (ret != -EAGAIN)
+ pr_err("prepare_auth_request_more failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode auth_reply_more\n");
+ return -EINVAL;
+}
+
+/*
+ * Align session_key and con_secret to avoid GFP_ATOMIC allocation
+ * inside crypto_shash_setkey() and crypto_aead_setkey() called from
+ * setup_crypto(). __aligned(16) isn't guaranteed to work for stack
+ * objects, so do it by hand.
+ */
+static int process_auth_done(struct ceph_connection *con, void *p, void *end)
+{
+ u8 session_key_buf[CEPH_KEY_LEN + 16];
+ u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16];
+ u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16);
+ u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16);
+ int session_key_len, con_secret_len;
+ int payload_len;
+ u64 global_id;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_done";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, global_id, bad);
+ ceph_decode_32_safe(&p, end, con->v2.con_mode, bad);
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+
+ dout("%s con %p global_id %llu con_mode %d payload_len %d\n",
+ __func__, con, global_id, con->v2.con_mode, payload_len);
+
+ mutex_unlock(&con->mutex);
+ session_key_len = 0;
+ con_secret_len = 0;
+ ret = con->ops->handle_auth_done(con, global_id, p, payload_len,
+ session_key, &session_key_len,
+ con_secret, &con_secret_len);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret);
+ if (ret)
+ goto out;
+
+ ret = setup_crypto(con, session_key, session_key_len, con_secret,
+ con_secret_len);
+ if (ret)
+ goto out;
+
+ reset_out_kvecs(con);
+ ret = prepare_auth_signature(con);
+ if (ret) {
+ pr_err("prepare_auth_signature failed: %d\n", ret);
+ goto out;
+ }
+
+ con->state = CEPH_CON_S_V2_AUTH_SIGNATURE;
+
+out:
+ memzero_explicit(session_key_buf, sizeof(session_key_buf));
+ memzero_explicit(con_secret_buf, sizeof(con_secret_buf));
+ return ret;
+
+bad:
+ pr_err("failed to decode auth_done\n");
+ return -EINVAL;
+}
+
+static int process_auth_signature(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u8 hmac[SHA256_DIGEST_SIZE];
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) {
+ con->error_msg = "protocol error, unexpected auth_signature";
+ return -EINVAL;
+ }
+
+ ret = hmac_sha256(con, con->v2.out_sign_kvecs,
+ con->v2.out_sign_kvec_cnt, hmac);
+ if (ret)
+ return ret;
+
+ ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad);
+ if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) {
+ con->error_msg = "integrity error, bad auth signature";
+ return -EBADMSG;
+ }
+
+ dout("%s con %p auth signature ok\n", __func__, con);
+
+ /* no reset_out_kvecs() as our auth_signature may still be pending */
+ if (!con->v2.server_cookie) {
+ ret = prepare_client_ident(con);
+ if (ret) {
+ pr_err("prepare_client_ident failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+ } else {
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_RECONNECT;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode auth_signature\n");
+ return -EINVAL;
+}
+
+static int process_server_ident(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 features, required_features;
+ struct ceph_entity_addr addr;
+ u64 global_seq;
+ u64 global_id;
+ u64 cookie;
+ u64 flags;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
+ con->error_msg = "protocol error, unexpected server_ident";
+ return -EINVAL;
+ }
+
+ ret = ceph_decode_entity_addrvec(&p, end, true, &addr);
+ if (ret) {
+ pr_err("failed to decode server addrs: %d\n", ret);
+ return ret;
+ }
+
+ ceph_decode_64_safe(&p, end, global_id, bad);
+ ceph_decode_64_safe(&p, end, global_seq, bad);
+ ceph_decode_64_safe(&p, end, features, bad);
+ ceph_decode_64_safe(&p, end, required_features, bad);
+ ceph_decode_64_safe(&p, end, flags, bad);
+ ceph_decode_64_safe(&p, end, cookie, bad);
+
+ dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n",
+ __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce),
+ global_id, global_seq, features, required_features, flags, cookie);
+
+ /* is this who we intended to talk to? */
+ if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) {
+ pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n",
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&addr), le32_to_cpu(addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -EINVAL;
+ }
+
+ if (client->required_features & ~features) {
+ pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+ features, client->required_features & ~features);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+
+ /*
+ * Both name->type and name->num are set in ceph_con_open() but
+ * name->num may be bogus in the initial monmap. name->type is
+ * verified in process_hello().
+ */
+ WARN_ON(!con->peer_name.type);
+ con->peer_name.num = cpu_to_le64(global_id);
+ con->v2.peer_global_seq = global_seq;
+ con->peer_features = features;
+ WARN_ON(required_features & ~client->supported_features);
+ con->v2.server_cookie = cookie;
+
+ if (flags & CEPH_MSG_CONNECT_LOSSY) {
+ ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+ WARN_ON(con->v2.server_cookie);
+ } else {
+ WARN_ON(!con->v2.server_cookie);
+ }
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+ con->delay = 0; /* reset backoff memory */
+
+ con->state = CEPH_CON_S_OPEN;
+ con->v2.out_state = OUT_S_GET_NEXT;
+ return 0;
+
+bad:
+ pr_err("failed to decode server_ident\n");
+ return -EINVAL;
+}
+
+static int process_ident_missing_features(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 missing_features;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
+ con->error_msg = "protocol error, unexpected ident_missing_features";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, missing_features, bad);
+ pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+ client->supported_features, missing_features);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+
+bad:
+ pr_err("failed to decode ident_missing_features\n");
+ return -EINVAL;
+}
+
+static int process_session_reconnect_ok(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 seq;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_reconnect_ok";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, seq, bad);
+
+ dout("%s con %p seq %llu\n", __func__, con, seq);
+ ceph_con_discard_requeued(con, seq);
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+ con->delay = 0; /* reset backoff memory */
+
+ con->state = CEPH_CON_S_OPEN;
+ con->v2.out_state = OUT_S_GET_NEXT;
+ return 0;
+
+bad:
+ pr_err("failed to decode session_reconnect_ok\n");
+ return -EINVAL;
+}
+
+static int process_session_retry(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 connect_seq;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_retry";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, connect_seq, bad);
+
+ dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq);
+ WARN_ON(connect_seq <= con->v2.connect_seq);
+ con->v2.connect_seq = connect_seq + 1;
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode session_retry\n");
+ return -EINVAL;
+}
+
+static int process_session_retry_global(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 global_seq;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_retry_global";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, global_seq, bad);
+
+ dout("%s con %p global_seq %llu\n", __func__, con, global_seq);
+ WARN_ON(global_seq <= con->v2.global_seq);
+ con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq);
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode session_retry_global\n");
+ return -EINVAL;
+}
+
+static int process_session_reset(struct ceph_connection *con,
+ void *p, void *end)
+{
+ bool full;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_reset";
+ return -EINVAL;
+ }
+
+ ceph_decode_8_safe(&p, end, full, bad);
+ if (!full) {
+ con->error_msg = "protocol error, bad session_reset";
+ return -EINVAL;
+ }
+
+ pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr));
+ ceph_con_reset_session(con);
+
+ mutex_unlock(&con->mutex);
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_client_ident(con);
+ if (ret) {
+ pr_err("prepare_client_ident (rst) failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+ return 0;
+
+bad:
+ pr_err("failed to decode session_reset\n");
+ return -EINVAL;
+}
+
+static int process_keepalive2_ack(struct ceph_connection *con,
+ void *p, void *end)
+{
+ if (con->state != CEPH_CON_S_OPEN) {
+ con->error_msg = "protocol error, unexpected keepalive2_ack";
+ return -EINVAL;
+ }
+
+ ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad);
+ ceph_decode_timespec64(&con->last_keepalive_ack, p);
+
+ dout("%s con %p timestamp %lld.%09ld\n", __func__, con,
+ con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec);
+
+ return 0;
+
+bad:
+ pr_err("failed to decode keepalive2_ack\n");
+ return -EINVAL;
+}
+
+static int process_ack(struct ceph_connection *con, void *p, void *end)
+{
+ u64 seq;
+
+ if (con->state != CEPH_CON_S_OPEN) {
+ con->error_msg = "protocol error, unexpected ack";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, seq, bad);
+
+ dout("%s con %p seq %llu\n", __func__, con, seq);
+ ceph_con_discard_sent(con, seq);
+ return 0;
+
+bad:
+ pr_err("failed to decode ack\n");
+ return -EINVAL;
+}
+
+static int process_control(struct ceph_connection *con, void *p, void *end)
+{
+ int tag = con->v2.in_desc.fd_tag;
+ int ret;
+
+ dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p));
+
+ switch (tag) {
+ case FRAME_TAG_HELLO:
+ ret = process_hello(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_BAD_METHOD:
+ ret = process_auth_bad_method(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_REPLY_MORE:
+ ret = process_auth_reply_more(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_DONE:
+ ret = process_auth_done(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_SIGNATURE:
+ ret = process_auth_signature(con, p, end);
+ break;
+ case FRAME_TAG_SERVER_IDENT:
+ ret = process_server_ident(con, p, end);
+ break;
+ case FRAME_TAG_IDENT_MISSING_FEATURES:
+ ret = process_ident_missing_features(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RECONNECT_OK:
+ ret = process_session_reconnect_ok(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RETRY:
+ ret = process_session_retry(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RETRY_GLOBAL:
+ ret = process_session_retry_global(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RESET:
+ ret = process_session_reset(con, p, end);
+ break;
+ case FRAME_TAG_KEEPALIVE2_ACK:
+ ret = process_keepalive2_ack(con, p, end);
+ break;
+ case FRAME_TAG_ACK:
+ ret = process_ack(con, p, end);
+ break;
+ default:
+ pr_err("bad tag %d\n", tag);
+ con->error_msg = "protocol error, bad tag";
+ return -EINVAL;
+ }
+ if (ret) {
+ dout("%s con %p error %d\n", __func__, con, ret);
+ return ret;
+ }
+
+ prepare_read_preamble(con);
+ return 0;
+}
+
+/*
+ * Return:
+ * 1 - con->in_msg set, read message
+ * 0 - skip message
+ * <0 - error
+ */
+static int process_message_header(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ struct ceph_msg_header2 *hdr2 = p;
+ struct ceph_msg_header hdr;
+ int skip;
+ int ret;
+ u64 seq;
+
+ /* verify seq# */
+ seq = le64_to_cpu(hdr2->seq);
+ if ((s64)seq - (s64)con->in_seq < 1) {
+ pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ seq, con->in_seq + 1);
+ return 0;
+ }
+ if ((s64)seq - (s64)con->in_seq > 1) {
+ pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1);
+ con->error_msg = "bad message sequence # for incoming message";
+ return -EBADE;
+ }
+
+ ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq));
+
+ fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2],
+ desc->fd_lens[3], &con->peer_name);
+ ret = ceph_con_in_msg_alloc(con, &hdr, &skip);
+ if (ret)
+ return ret;
+
+ WARN_ON(!con->in_msg ^ skip);
+ if (skip)
+ return 0;
+
+ WARN_ON(!con->in_msg);
+ WARN_ON(con->in_msg->con != con);
+ return 1;
+}
+
+static int process_message(struct ceph_connection *con)
+{
+ ceph_con_process_message(con);
+
+ /*
+ * We could have been closed by ceph_con_close() because
+ * ceph_con_process_message() temporarily drops con->mutex.
+ */
+ if (con->state != CEPH_CON_S_OPEN) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ prepare_read_preamble(con);
+ return 0;
+}
+
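+/*
+ * Dispatch a complete control payload: non-MESSAGE frames go to
+ * process_control(); for MESSAGE frames decide whether to skip,
+ * process an empty message right away or set up kvecs/bvecs for
+ * reading front, middle and data into the allocated ceph_msg.
+ */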
+static int __handle_control(struct ceph_connection *con, void *p)
+{
+ void *end = p + con->v2.in_desc.fd_lens[0];
+ struct ceph_msg *msg;
+ int ret;
+
+ if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE)
+ return process_control(con, p, end);
+
+ ret = process_message_header(con, p, end);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ prepare_skip_message(con);
+ return 0;
+ }
+
+ msg = con->in_msg; /* set in process_message_header() */
+ if (!front_len(msg) && !middle_len(msg)) {
+ if (!data_len(msg))
+ return process_message(con);
+
+ prepare_read_data(con);
+ return 0;
+ }
+
+ reset_in_kvecs(con);
+ if (front_len(msg)) {
+ WARN_ON(front_len(msg) > msg->front_alloc_len);
+ add_in_kvec(con, msg->front.iov_base, front_len(msg));
+ msg->front.iov_len = front_len(msg);
+
+ if (con_secure(con) && need_padding(front_len(msg)))
+ add_in_kvec(con, FRONT_PAD(con->v2.in_buf),
+ padding_len(front_len(msg)));
+ } else {
+ msg->front.iov_len = 0;
+ }
+ if (middle_len(msg)) {
+ WARN_ON(middle_len(msg) > msg->middle->alloc_len);
+ add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+ msg->middle->vec.iov_len = middle_len(msg);
+
+ if (con_secure(con) && need_padding(middle_len(msg)))
+ add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf),
+ padding_len(middle_len(msg)));
+ } else if (msg->middle) {
+ msg->middle->vec.iov_len = 0;
+ }
+
+ if (data_len(msg)) {
+ con->v2.in_state = IN_S_PREPARE_READ_DATA;
+ } else {
+ add_in_kvec(con, con->v2.in_buf,
+ con_secure(con) ? CEPH_EPILOGUE_SECURE_LEN :
+ CEPH_EPILOGUE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+ }
+ return 0;
+}
+
+static int handle_preamble(struct ceph_connection *con)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ int ret;
+
+ if (con_secure(con)) {
+ ret = decrypt_preamble(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad preamble auth tag";
+ return ret;
+ }
+ }
+
+ ret = decode_preamble(con->v2.in_buf, desc);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad crc";
+ else
+ con->error_msg = "protocol error, bad preamble";
+ return ret;
+ }
+
+ dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__,
+ con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0],
+ desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]);
+
+ if (!con_secure(con))
+ return prepare_read_control(con);
+
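+ /*
+ * Secure mode: up to CEPH_PREAMBLE_INLINE_LEN bytes of the control
+ * payload are carried inline in the (already decrypted) preamble,
+ * so short control frames can be handled right away.
+ */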
+ if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN)
+ return prepare_read_control_remainder(con);
+
+ return __handle_control(con, CTRL_BODY(con->v2.in_buf));
+}
+
+static int handle_control(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ void *buf;
+ int ret;
+
+ WARN_ON(con_secure(con));
+
+ ret = verify_control_crc(con);
+ if (ret) {
+ con->error_msg = "integrity error, bad crc";
+ return ret;
+ }
+
+ if (con->state == CEPH_CON_S_V2_AUTH) {
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len);
+ return __handle_control(con, buf);
+ }
+
+ return __handle_control(con, con->v2.in_kvecs[0].iov_base);
+}
+
+static int handle_control_remainder(struct ceph_connection *con)
+{
+ int ret;
+
+ WARN_ON(!con_secure(con));
+
+ ret = decrypt_control_remainder(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad control remainder auth tag";
+ return ret;
+ }
+
+ return __handle_control(con, con->v2.in_kvecs[0].iov_base -
+ CEPH_PREAMBLE_INLINE_LEN);
+}
+
+static int handle_epilogue(struct ceph_connection *con)
+{
+ u32 front_crc, middle_crc, data_crc;
+ int ret;
+
+ if (con_secure(con)) {
+ ret = decrypt_message(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad epilogue auth tag";
+ return ret;
+ }
+
+ /* just late_status */
+ ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL);
+ if (ret) {
+ con->error_msg = "protocol error, bad epilogue";
+ return ret;
+ }
+ } else {
+ ret = decode_epilogue(con->v2.in_buf, &front_crc,
+ &middle_crc, &data_crc);
+ if (ret) {
+ con->error_msg = "protocol error, bad epilogue";
+ return ret;
+ }
+
+ ret = verify_epilogue_crcs(con, front_crc, middle_crc,
+ data_crc);
+ if (ret) {
+ con->error_msg = "integrity error, bad crc";
+ return ret;
+ }
+ }
+
+ return process_message(con);
+}
+
+static void finish_skip(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+
+ if (con_secure(con))
+ gcm_inc_nonce(&con->v2.in_gcm_nonce);
+
+ __finish_skip(con);
+}
+
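+/*
+ * Advance the read state machine based on what was just read and
+ * populate in_iter for the next chunk. Returns 1 if in_iter was
+ * populated, <0 on error.
+ */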
+static int populate_in_iter(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d in_state %d\n", __func__, con, con->state,
+ con->v2.in_state);
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) {
+ ret = process_banner_prefix(con);
+ } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) {
+ ret = process_banner_payload(con);
+ } else if ((con->state >= CEPH_CON_S_V2_HELLO &&
+ con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) ||
+ con->state == CEPH_CON_S_OPEN) {
+ switch (con->v2.in_state) {
+ case IN_S_HANDLE_PREAMBLE:
+ ret = handle_preamble(con);
+ break;
+ case IN_S_HANDLE_CONTROL:
+ ret = handle_control(con);
+ break;
+ case IN_S_HANDLE_CONTROL_REMAINDER:
+ ret = handle_control_remainder(con);
+ break;
+ case IN_S_PREPARE_READ_DATA:
+ prepare_read_data(con);
+ ret = 0;
+ break;
+ case IN_S_PREPARE_READ_DATA_CONT:
+ prepare_read_data_cont(con);
+ ret = 0;
+ break;
+ case IN_S_HANDLE_EPILOGUE:
+ ret = handle_epilogue(con);
+ break;
+ case IN_S_FINISH_SKIP:
+ finish_skip(con);
+ ret = 0;
+ break;
+ default:
+ WARN(1, "bad in_state %d", con->v2.in_state);
+ return -EINVAL;
+ }
+ } else {
+ WARN(1, "bad state %d", con->state);
+ return -EINVAL;
+ }
+ if (ret) {
+ dout("%s con %p error %d\n", __func__, con, ret);
+ return ret;
+ }
+
+ if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+ return -ENODATA;
+ dout("%s con %p populated %zu\n", __func__, con,
+ iov_iter_count(&con->v2.in_iter));
+ return 1;
+}
+
+int ceph_con_v2_try_read(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d need %zu\n", __func__, con, con->state,
+ iov_iter_count(&con->v2.in_iter));
+
+ if (con->state == CEPH_CON_S_PREOPEN)
+ return 0;
+
+ /*
+ * We should always have something pending here. If not, bail out:
+ * on an empty iter ceph_tcp_recv() would return 1 immediately and
+ * we would call populate_in_iter() as though we had read something.
+ */
+ if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+ return -ENODATA;
+
+ for (;;) {
+ ret = ceph_tcp_recv(con);
+ if (ret <= 0)
+ return ret;
+
+ ret = populate_in_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "read processing error";
+ return ret;
+ }
+ }
+}
+
+static void queue_data(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ con->v2.out_epil.data_crc = -1;
+ ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg,
+ data_len(con->out_msg));
+
+ get_bvec_at(&con->v2.out_cursor, &bv);
+ set_out_bvec(con, &bv, true);
+ con->v2.out_state = OUT_S_QUEUE_DATA_CONT;
+}
+
+static void queue_data_cont(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ con->v2.out_epil.data_crc = ceph_crc32c_page(
+ con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+ con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len);
+
+ ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len);
+ if (con->v2.out_cursor.total_resid) {
+ get_bvec_at(&con->v2.out_cursor, &bv);
+ set_out_bvec(con, &bv, true);
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT);
+ return;
+ }
+
+ /*
+ * We've written all data. Queue epilogue. Once it's written,
+ * we are done.
+ */
+ reset_out_kvecs(con);
+ prepare_epilogue_plain(con, false);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void queue_enc_page(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i,
+ con->v2.out_enc_resid);
+ WARN_ON(!con->v2.out_enc_resid);
+
+ bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i];
+ bv.bv_offset = 0;
+ bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE);
+
+ set_out_bvec(con, &bv, false);
+ con->v2.out_enc_i++;
+ con->v2.out_enc_resid -= bv.bv_len;
+
+ if (con->v2.out_enc_resid) {
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE);
+ return;
+ }
+
+ /*
+ * We've queued the last piece of ciphertext (ending with
+ * epilogue) + auth tag. Once it's written, we are done.
+ */
+ WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void queue_zeros(struct ceph_connection *con)
+{
+ dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero);
+
+ if (con->v2.out_zero) {
+ set_out_bvec_zero(con);
+ con->v2.out_zero -= con->v2.out_bvec.bv_len;
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ /*
+ * We've zero-filled everything up to the epilogue. Queue the epilogue
+ * with late_status set to ABORTED and crcs adjusted for zeros.
+ * Once it's written, we are done patching up for the revoke.
+ */
+ reset_out_kvecs(con);
+ prepare_epilogue_plain(con, true);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void finish_message(struct ceph_connection *con)
+{
+ dout("%s con %p msg %p\n", __func__, con, con->out_msg);
+
+ /* we end up here in both plain and secure modes */
+ if (con->v2.out_enc_pages) {
+ WARN_ON(!con->v2.out_enc_page_cnt);
+ ceph_release_page_vector(con->v2.out_enc_pages,
+ con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = NULL;
+ con->v2.out_enc_page_cnt = 0;
+ }
+ /* message may have been revoked */
+ if (con->out_msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+
+ con->v2.out_state = OUT_S_GET_NEXT;
+}
+
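+/*
+ * Finish the current out_state and queue the next item to send:
+ * a keepalive, the next message from out_queue or a standalone ack.
+ * Returns 1 if out_iter was populated, 0 if nothing is pending,
+ * <0 on error.
+ */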
+static int populate_out_iter(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d out_state %d\n", __func__, con, con->state,
+ con->v2.out_state);
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+
+ if (con->state != CEPH_CON_S_OPEN) {
+ WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX ||
+ con->state > CEPH_CON_S_V2_SESSION_RECONNECT);
+ goto nothing_pending;
+ }
+
+ switch (con->v2.out_state) {
+ case OUT_S_QUEUE_DATA:
+ WARN_ON(!con->out_msg);
+ queue_data(con);
+ goto populated;
+ case OUT_S_QUEUE_DATA_CONT:
+ WARN_ON(!con->out_msg);
+ queue_data_cont(con);
+ goto populated;
+ case OUT_S_QUEUE_ENC_PAGE:
+ queue_enc_page(con);
+ goto populated;
+ case OUT_S_QUEUE_ZEROS:
+ WARN_ON(con->out_msg); /* revoked */
+ queue_zeros(con);
+ goto populated;
+ case OUT_S_FINISH_MESSAGE:
+ finish_message(con);
+ break;
+ case OUT_S_GET_NEXT:
+ break;
+ default:
+ WARN(1, "bad out_state %d", con->v2.out_state);
+ return -EINVAL;
+ }
+
+ WARN_ON(con->v2.out_state != OUT_S_GET_NEXT);
+ if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
+ ret = prepare_keepalive2(con);
+ if (ret) {
+ pr_err("prepare_keepalive2 failed: %d\n", ret);
+ return ret;
+ }
+ } else if (!list_empty(&con->out_queue)) {
+ ceph_con_get_out_msg(con);
+ ret = prepare_message(con);
+ if (ret) {
+ pr_err("prepare_message failed: %d\n", ret);
+ return ret;
+ }
+ } else if (con->in_seq > con->in_seq_acked) {
+ ret = prepare_ack(con);
+ if (ret) {
+ pr_err("prepare_ack failed: %d\n", ret);
+ return ret;
+ }
+ } else {
+ goto nothing_pending;
+ }
+
+populated:
+ if (WARN_ON(!iov_iter_count(&con->v2.out_iter)))
+ return -ENODATA;
+ dout("%s con %p populated %zu\n", __func__, con,
+ iov_iter_count(&con->v2.out_iter));
+ return 1;
+
+nothing_pending:
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ dout("%s con %p nothing pending\n", __func__, con);
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+int ceph_con_v2_try_write(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d have %zu\n", __func__, con, con->state,
+ iov_iter_count(&con->v2.out_iter));
+
+ /* open the socket first? */
+ if (con->state == CEPH_CON_S_PREOPEN) {
+ WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2);
+
+ /*
+ * Always bump global_seq. Bump connect_seq only if
+ * there is a session (i.e. we are reconnecting and will
+ * send session_reconnect instead of client_ident).
+ */
+ con->v2.global_seq = ceph_get_global_seq(con->msgr, 0);
+ if (con->v2.server_cookie)
+ con->v2.connect_seq++;
+
+ ret = prepare_read_banner_prefix(con);
+ if (ret) {
+ pr_err("prepare_read_banner_prefix failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+
+ reset_out_kvecs(con);
+ ret = prepare_banner(con);
+ if (ret) {
+ pr_err("prepare_banner failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+
+ ret = ceph_tcp_connect(con);
+ if (ret) {
+ pr_err("ceph_tcp_connect failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+ }
+
+ if (!iov_iter_count(&con->v2.out_iter)) {
+ ret = populate_out_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "write processing error";
+ return ret;
+ }
+ }
+
+ tcp_sock_set_cork(con->sock->sk, true);
+ for (;;) {
+ ret = ceph_tcp_send(con);
+ if (ret <= 0)
+ break;
+
+ ret = populate_out_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "write processing error";
+ break;
+ }
+ }
+
+ tcp_sock_set_cork(con->sock->sk, false);
+ return ret;
+}
+
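+/* crc32c over zero_len zero bytes, fed one zero page at a time */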
+static u32 crc32c_zeros(u32 crc, int zero_len)
+{
+ int len;
+
+ while (zero_len) {
+ len = min(zero_len, (int)PAGE_SIZE);
+ crc = crc32c(crc, page_address(ceph_zero_page), len);
+ zero_len -= len;
+ }
+
+ return crc;
+}
+
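+/*
+ * Account for the unsent part of the front as zeros: fold it into
+ * front_crc, drop it from out_iter and schedule that many zero bytes
+ * to be sent instead. prepare_zero_middle() does the same for the
+ * middle.
+ */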
+static void prepare_zero_front(struct ceph_connection *con, int resid)
+{
+ int sent;
+
+ WARN_ON(!resid || resid > front_len(con->out_msg));
+ sent = front_len(con->out_msg) - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.front_crc =
+ crc32c(-1, con->out_msg->front.iov_base, sent);
+ con->v2.out_epil.front_crc =
+ crc32c_zeros(con->v2.out_epil.front_crc, resid);
+ } else {
+ con->v2.out_epil.front_crc = crc32c_zeros(-1, resid);
+ }
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, resid);
+}
+
+static void prepare_zero_middle(struct ceph_connection *con, int resid)
+{
+ int sent;
+
+ WARN_ON(!resid || resid > middle_len(con->out_msg));
+ sent = middle_len(con->out_msg) - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.middle_crc =
+ crc32c(-1, con->out_msg->middle->vec.iov_base, sent);
+ con->v2.out_epil.middle_crc =
+ crc32c_zeros(con->v2.out_epil.middle_crc, resid);
+ } else {
+ con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid);
+ }
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, resid);
+}
+
+static void prepare_zero_data(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+ con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg));
+ out_zero_add(con, data_len(con->out_msg));
+}
+
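+/*
+ * Revoked while the plain-mode head + front + middle kvecs were being
+ * sent (data not queued yet): figure out which segment was in flight
+ * and substitute zeros for everything that hasn't gone out.
+ */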
+static void revoke_at_queue_data(struct ceph_connection *con)
+{
+ int boundary;
+ int resid;
+
+ WARN_ON(!data_len(con->out_msg));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+
+ boundary = front_len(con->out_msg) + middle_len(con->out_msg);
+ if (resid > boundary) {
+ resid -= boundary;
+ WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head\n", __func__, con);
+ if (front_len(con->out_msg))
+ prepare_zero_front(con, front_len(con->out_msg));
+ if (middle_len(con->out_msg))
+ prepare_zero_middle(con, middle_len(con->out_msg));
+ prepare_zero_data(con);
+ WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ boundary = middle_len(con->out_msg);
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending front\n", __func__, con);
+ prepare_zero_front(con, resid);
+ if (middle_len(con->out_msg))
+ prepare_zero_middle(con, middle_len(con->out_msg));
+ prepare_zero_data(con);
+ queue_zeros(con);
+ return;
+ }
+
+ WARN_ON(!resid);
+ dout("%s con %p was sending middle\n", __func__, con);
+ prepare_zero_middle(con, resid);
+ prepare_zero_data(con);
+ queue_zeros(con);
+}
+
+static void revoke_at_queue_data_cont(struct ceph_connection *con)
+{
+ int sent, resid; /* current piece of data */
+
+ WARN_ON(!data_len(con->out_msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+ WARN_ON(!resid || resid > con->v2.out_bvec.bv_len);
+ sent = con->v2.out_bvec.bv_len - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.data_crc = ceph_crc32c_page(
+ con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+ con->v2.out_bvec.bv_offset, sent);
+ ceph_msg_data_advance(&con->v2.out_cursor, sent);
+ }
+ WARN_ON(resid > con->v2.out_cursor.total_resid);
+ con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc,
+ con->v2.out_cursor.total_resid);
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, con->v2.out_cursor.total_resid);
+ queue_zeros(con);
+}
+
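+/*
+ * Revoked in OUT_S_FINISH_MESSAGE: out_iter may still cover head,
+ * front, middle and the epilogue (message queued entirely as kvecs)
+ * or just the epilogue (queued after the data).
+ */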
+static void revoke_at_finish_message(struct ceph_connection *con)
+{
+ int boundary;
+ int resid;
+
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+
+ if (!front_len(con->out_msg) && !middle_len(con->out_msg) &&
+ !data_len(con->out_msg)) {
+ WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head (empty message) - noop\n",
+ __func__, con);
+ return;
+ }
+
+ boundary = front_len(con->out_msg) + middle_len(con->out_msg) +
+ CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head\n", __func__, con);
+ if (front_len(con->out_msg))
+ prepare_zero_front(con, front_len(con->out_msg));
+ if (middle_len(con->out_msg))
+ prepare_zero_middle(con, middle_len(con->out_msg));
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending front\n", __func__, con);
+ prepare_zero_front(con, resid);
+ if (middle_len(con->out_msg))
+ prepare_zero_middle(con, middle_len(con->out_msg));
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ queue_zeros(con);
+ return;
+ }
+
+ boundary = CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending middle\n", __func__, con);
+ prepare_zero_middle(con, resid);
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ queue_zeros(con);
+ return;
+ }
+
+ WARN_ON(!resid);
+ dout("%s con %p was sending epilogue - noop\n", __func__, con);
+}
+
+void ceph_con_v2_revoke(struct ceph_connection *con)
+{
+ WARN_ON(con->v2.out_zero);
+
+ if (con_secure(con)) {
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE &&
+ con->v2.out_state != OUT_S_FINISH_MESSAGE);
+ dout("%s con %p secure - noop\n", __func__, con);
+ return;
+ }
+
+ switch (con->v2.out_state) {
+ case OUT_S_QUEUE_DATA:
+ revoke_at_queue_data(con);
+ break;
+ case OUT_S_QUEUE_DATA_CONT:
+ revoke_at_queue_data_cont(con);
+ break;
+ case OUT_S_FINISH_MESSAGE:
+ revoke_at_finish_message(con);
+ break;
+ default:
+ WARN(1, "bad out_state %d", con->v2.out_state);
+ break;
+ }
+}
+
+static void revoke_at_prepare_read_data(struct ceph_connection *con)
+{
+ int remaining; /* data + [data padding] + epilogue */
+ int resid;
+
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid);
+
+ if (con_secure(con))
+ remaining = padded_len(data_len(con->in_msg)) +
+ CEPH_EPILOGUE_SECURE_LEN;
+ else
+ remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN;
+
+ dout("%s con %p resid %d remaining %d\n", __func__, con, resid,
+ remaining);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid + remaining);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_prepare_read_data_cont(struct ceph_connection *con)
+{
+ int recved, resid; /* current piece of data */
+ int remaining; /* [data padding] + epilogue */
+
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
+ recved = con->v2.in_bvec.bv_len - resid;
+ dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid);
+
+ if (recved)
+ ceph_msg_data_advance(&con->v2.in_cursor, recved);
+ WARN_ON(resid > con->v2.in_cursor.total_resid);
+
+ if (con_secure(con))
+ remaining = padding_len(data_len(con->in_msg)) +
+ CEPH_EPILOGUE_SECURE_LEN;
+ else
+ remaining = CEPH_EPILOGUE_PLAIN_LEN;
+
+ dout("%s con %p total_resid %zu remaining %d\n", __func__, con,
+ con->v2.in_cursor.total_resid, remaining);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, con->v2.in_cursor.total_resid + remaining);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_handle_epilogue(struct ceph_connection *con)
+{
+ int resid;
+
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid);
+
+ dout("%s con %p resid %d\n", __func__, con, resid);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
+{
+ switch (con->v2.in_state) {
+ case IN_S_PREPARE_READ_DATA:
+ revoke_at_prepare_read_data(con);
+ break;
+ case IN_S_PREPARE_READ_DATA_CONT:
+ revoke_at_prepare_read_data_cont(con);
+ break;
+ case IN_S_HANDLE_EPILOGUE:
+ revoke_at_handle_epilogue(con);
+ break;
+ default:
+ WARN(1, "bad in_state %d", con->v2.in_state);
+ break;
+ }
+}
+
+bool ceph_con_v2_opened(struct ceph_connection *con)
+{
+ return con->v2.peer_global_seq;
+}
+
+void ceph_con_v2_reset_session(struct ceph_connection *con)
+{
+ con->v2.client_cookie = 0;
+ con->v2.server_cookie = 0;
+ con->v2.global_seq = 0;
+ con->v2.connect_seq = 0;
+ con->v2.peer_global_seq = 0;
+}
+
+void ceph_con_v2_reset_protocol(struct ceph_connection *con)
+{
+ iov_iter_truncate(&con->v2.in_iter, 0);
+ iov_iter_truncate(&con->v2.out_iter, 0);
+ con->v2.out_zero = 0;
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+
+ if (con->v2.out_enc_pages) {
+ WARN_ON(!con->v2.out_enc_page_cnt);
+ ceph_release_page_vector(con->v2.out_enc_pages,
+ con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = NULL;
+ con->v2.out_enc_page_cnt = 0;
+ }
+
+ con->v2.con_mode = CEPH_CON_MODE_UNKNOWN;
+ memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN);
+ memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN);
+
+ if (con->v2.hmac_tfm) {
+ crypto_free_shash(con->v2.hmac_tfm);
+ con->v2.hmac_tfm = NULL;
+ }
+ if (con->v2.gcm_req) {
+ aead_request_free(con->v2.gcm_req);
+ con->v2.gcm_req = NULL;
+ }
+ if (con->v2.gcm_tfm) {
+ crypto_free_aead(con->v2.gcm_tfm);
+ con->v2.gcm_tfm = NULL;
+ }
+}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index d633a0aeaa55..195ceb8afb06 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops;
static int __validate_auth(struct ceph_mon_client *monc);
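+
+/*
+ * Decode a single mon_info_t, keeping only the public address
+ * (the mon name and any trailing fields are skipped).
+ */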
+static int decode_mon_info(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr)
+{
+ void *mon_info_end;
+ u32 struct_len;
+ u8 struct_v;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v,
+ &struct_len);
+ if (ret)
+ return ret;
+
+ mon_info_end = *p + struct_len;
+ ceph_decode_skip_string(p, end, e_inval); /* skip mon name */
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+ if (ret)
+ return ret;
+
+ *p = mon_info_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
/*
* Decode a monmap blob (e.g., during mount).
+ *
+ * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC).
*/
-static struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2)
{
- struct ceph_monmap *m = NULL;
- int i, err = -EINVAL;
+ struct ceph_monmap *monmap = NULL;
struct ceph_fsid fsid;
- u32 epoch, num_mon;
- u32 len;
+ u32 struct_len;
+ int blob_len;
+ int num_mon;
+ u8 struct_v;
+ u32 epoch;
+ int ret;
+ int i;
+
+ ceph_decode_32_safe(p, end, blob_len, e_inval);
+ ceph_decode_need(p, end, blob_len, e_inval);
+
+ ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len);
+ if (ret)
+ goto fail;
- ceph_decode_32_safe(&p, end, len, bad);
- ceph_decode_need(&p, end, len, bad);
+ dout("%s struct_v %d\n", __func__, struct_v);
+ ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval);
+ ceph_decode_32_safe(p, end, epoch, e_inval);
+ if (struct_v >= 6) {
+ u32 feat_struct_len;
+ u8 feat_struct_v;
- dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p));
- p += sizeof(u16); /* skip version */
+ *p += sizeof(struct ceph_timespec); /* skip last_changed */
+ *p += sizeof(struct ceph_timespec); /* skip created */
- ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
- ceph_decode_copy(&p, &fsid, sizeof(fsid));
- epoch = ceph_decode_32(&p);
+ ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+ &feat_struct_v, &feat_struct_len);
+ if (ret)
+ goto fail;
- num_mon = ceph_decode_32(&p);
+ *p += feat_struct_len; /* skip persistent_features */
+ ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+ &feat_struct_v, &feat_struct_len);
+ if (ret)
+ goto fail;
+
+ *p += feat_struct_len; /* skip optional_features */
+ }
+ ceph_decode_32_safe(p, end, num_mon, e_inval);
+
+ dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch,
+ num_mon);
if (num_mon > CEPH_MAX_MON)
- goto bad;
- m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS);
- if (m == NULL)
- return ERR_PTR(-ENOMEM);
- m->fsid = fsid;
- m->epoch = epoch;
- m->num_mon = num_mon;
- for (i = 0; i < num_mon; ++i) {
- struct ceph_entity_inst *inst = &m->mon_inst[i];
-
- /* copy name portion */
- ceph_decode_copy_safe(&p, end, &inst->name,
- sizeof(inst->name), bad);
- err = ceph_decode_entity_addr(&p, end, &inst->addr);
- if (err)
- goto bad;
+ goto e_inval;
+
+ monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO);
+ if (!monmap) {
+ ret = -ENOMEM;
+ goto fail;
}
- dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
- m->num_mon);
- for (i = 0; i < m->num_mon; i++)
- dout("monmap_decode mon%d is %s\n", i,
- ceph_pr_addr(&m->mon_inst[i].addr));
- return m;
-bad:
- dout("monmap_decode failed with %d\n", err);
- kfree(m);
- return ERR_PTR(err);
+ monmap->fsid = fsid;
+ monmap->epoch = epoch;
+ monmap->num_mon = num_mon;
+
+ /* legacy_mon_addr map or mon_info map */
+ for (i = 0; i < num_mon; i++) {
+ struct ceph_entity_inst *inst = &monmap->mon_inst[i];
+
+ ceph_decode_skip_string(p, end, e_inval); /* skip mon name */
+ inst->name.type = CEPH_ENTITY_TYPE_MON;
+ inst->name.num = cpu_to_le64(i);
+
+ if (struct_v >= 6)
+ ret = decode_mon_info(p, end, msgr2, &inst->addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &inst->addr);
+ if (ret)
+ goto fail;
+
+ dout("%s mon%d addr %s\n", __func__, i,
+ ceph_pr_addr(&inst->addr));
+ }
+
+ return monmap;
+
+e_inval:
+ ret = -EINVAL;
+fail:
+ kfree(monmap);
+ return ERR_PTR(ret);
}
/*
@@ -96,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
{
int i;
- for (i = 0; i < m->num_mon; i++)
- if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+ for (i = 0; i < m->num_mon; i++) {
+ if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr))
return 1;
+ }
+
return 0;
}
@@ -190,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc)
&monc->monmap->mon_inst[monc->cur_mon].addr);
/*
- * send an initial keepalive to ensure our timestamp is valid
- * by the time we are in an OPENED state
+ * Queue a keepalive so that in case of an early fault the
+ * messenger retries instead of putting us into STANDBY state.
+ * This also ensures that our timestamp is valid by
+ * the time we finish hunting and delayed_work() checks it.
*/
ceph_con_keepalive(&monc->con);
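+
+ /*
+ * msgr2: authentication is driven by the messenger through the
+ * get_auth_request/handle_auth_* hooks, so just mark auth as
+ * pending instead of building an initial auth message here.
+ */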
+ if (ceph_msgr2(monc->client)) {
+ monc->pending_auth = 1;
+ return;
+ }
/* initiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth,
@@ -476,7 +549,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
p = msg->front.iov_base;
end = p + msg->front.iov_len;
- monmap = ceph_monmap_decode(p, end);
+ monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client));
if (IS_ERR(monmap)) {
pr_err("problem decoding monmap, %d\n",
(int)PTR_ERR(monmap));
@@ -896,8 +969,9 @@ bad:
ceph_msg_dump(msg);
}
-int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
- struct ceph_entity_addr *client_addr)
+static __printf(2, 0)
+int do_mon_command_vargs(struct ceph_mon_client *monc, const char *fmt,
+ va_list ap)
{
struct ceph_mon_generic_request *req;
struct ceph_mon_command *h;
@@ -925,29 +999,65 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid;
h->num_strs = cpu_to_le32(1);
- len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \
- \"blacklistop\": \"add\", \
- \"addr\": \"%pISpc/%u\" }",
- &client_addr->in_addr, le32_to_cpu(client_addr->nonce));
+ len = vsprintf(h->str, fmt, ap);
h->str_len = cpu_to_le32(len);
send_generic_request(monc, req);
mutex_unlock(&monc->mutex);
ret = wait_generic_request(req);
- if (!ret)
- /*
- * Make sure we have the osdmap that includes the blacklist
- * entry. This is needed to ensure that the OSDs pick up the
- * new blacklist before processing any future requests from
- * this client.
- */
- ret = ceph_wait_for_latest_osdmap(monc->client, 0);
-
out:
put_generic_request(req);
return ret;
}
-EXPORT_SYMBOL(ceph_monc_blacklist_add);
+
+static __printf(2, 3)
+int do_mon_command(struct ceph_mon_client *monc, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = do_mon_command_vargs(monc, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
+ struct ceph_entity_addr *client_addr)
+{
+ int ret;
+
+ ret = do_mon_command(monc,
+ "{ \"prefix\": \"osd blocklist\", \
+ \"blocklistop\": \"add\", \
+ \"addr\": \"%pISpc/%u\" }",
+ &client_addr->in_addr,
+ le32_to_cpu(client_addr->nonce));
+ if (ret == -EINVAL) {
+ /*
+ * The monitor returns EINVAL on an unrecognized command.
+ * Try the legacy command -- it is exactly the same except
+ * for the name.
+ */
+ ret = do_mon_command(monc,
+ "{ \"prefix\": \"osd blacklist\", \
+ \"blacklistop\": \"add\", \
+ \"addr\": \"%pISpc/%u\" }",
+ &client_addr->in_addr,
+ le32_to_cpu(client_addr->nonce));
+ }
+ if (ret)
+ return ret;
+
+ /*
+ * Make sure we have the osdmap that includes the blocklist
+ * entry. This is needed to ensure that the OSDs pick up the
+ * new blocklist before processing any future requests from
+ * this client.
+ */
+ return ceph_wait_for_latest_osdmap(monc->client, 0);
+}
+EXPORT_SYMBOL(ceph_monc_blocklist_add);
/*
* Resend pending generic requests.
@@ -1015,8 +1125,9 @@ static void delayed_work(struct work_struct *work)
*/
static int build_initial_monmap(struct ceph_mon_client *monc)
{
+ __le32 my_type = ceph_msgr2(monc->client) ?
+ CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY;
struct ceph_options *opt = monc->client->options;
- struct ceph_entity_addr *mon_addr = opt->mon_addr;
int num_mon = opt->num_mon;
int i;
@@ -1025,12 +1136,16 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
GFP_KERNEL);
if (!monc->monmap)
return -ENOMEM;
+
for (i = 0; i < num_mon; i++) {
- monc->monmap->mon_inst[i].addr = mon_addr[i];
- monc->monmap->mon_inst[i].addr.nonce = 0;
- monc->monmap->mon_inst[i].name.type =
- CEPH_ENTITY_TYPE_MON;
- monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+ struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i];
+
+ memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr,
+ sizeof(inst->addr.in_addr));
+ inst->addr.type = my_type;
+ inst->addr.nonce = 0;
+ inst->name.type = CEPH_ENTITY_TYPE_MON;
+ inst->name.num = cpu_to_le64(i);
}
monc->monmap->num_mon = num_mon;
return 0;
@@ -1052,8 +1167,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
/* connection */
/* authentication */
- monc->auth = ceph_auth_init(cl->options->name,
- cl->options->key);
+ monc->auth = ceph_auth_init(cl->options->name, cl->options->key,
+ cl->options->con_modes);
if (IS_ERR(monc->auth)) {
err = PTR_ERR(monc->auth);
goto out_monmap;
@@ -1157,30 +1272,22 @@ static void finish_hunting(struct ceph_mon_client *monc)
}
}
-static void handle_auth_reply(struct ceph_mon_client *monc,
- struct ceph_msg *msg)
+static void finish_auth(struct ceph_mon_client *monc, int auth_err,
+ bool was_authed)
{
- int ret;
- int was_auth = 0;
+ dout("%s auth_err %d was_authed %d\n", __func__, auth_err, was_authed);
+ WARN_ON(auth_err > 0);
- mutex_lock(&monc->mutex);
- was_auth = ceph_auth_is_authenticated(monc->auth);
monc->pending_auth = 0;
- ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
- msg->front.iov_len,
- monc->m_auth->front.iov_base,
- monc->m_auth->front_alloc_len);
- if (ret > 0) {
- __send_prepared_auth_request(monc, ret);
- goto out;
+ if (auth_err) {
+ monc->client->auth_err = auth_err;
+ wake_up_all(&monc->client->auth_wq);
+ return;
}
- finish_hunting(monc);
-
- if (ret < 0) {
- monc->client->auth_err = ret;
- } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
- dout("authenticated, starting session\n");
+ if (!was_authed && ceph_auth_is_authenticated(monc->auth)) {
+ dout("%s authenticated, starting session global_id %llu\n",
+ __func__, monc->auth->global_id);
monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
monc->client->msgr.inst.name.num =
@@ -1192,11 +1299,27 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
pr_info("mon%d %s session established\n", monc->cur_mon,
ceph_pr_addr(&monc->con.peer_addr));
}
+}
-out:
+static void handle_auth_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ bool was_authed;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+ msg->front.iov_len,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ if (ret > 0) {
+ __send_prepared_auth_request(monc, ret);
+ } else {
+ finish_auth(monc, ret, was_authed);
+ finish_hunting(monc);
+ }
mutex_unlock(&monc->mutex);
- if (monc->client->auth_err < 0)
- wake_up_all(&monc->client->auth_wq);
}
static int __validate_auth(struct ceph_mon_client *monc)
@@ -1225,10 +1348,92 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc)
}
EXPORT_SYMBOL(ceph_monc_validate_auth);
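+
+/*
+ * msgr2 auth hooks for the mon connection. The monitor speaks the
+ * auth protocol directly, so no authorizer is supplied; only the
+ * request payload is filled in.
+ */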
+static int mon_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mon_client *monc = con->private;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = ceph_auth_get_request(monc->auth, buf, *buf_len);
+ mutex_unlock(&monc->mutex);
+ if (ret < 0)
+ return ret;
+
+ *buf_len = ret;
+ *authorizer = NULL;
+ *authorizer_len = 0;
+ return 0;
+}
+
+static int mon_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mon_client *monc = con->private;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len,
+ buf, *buf_len);
+ mutex_unlock(&monc->mutex);
+ if (ret < 0)
+ return ret;
+
+ *buf_len = ret;
+ *authorizer = NULL;
+ *authorizer_len = 0;
+ return 0;
+}
+
+static int mon_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_mon_client *monc = con->private;
+ bool was_authed;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ WARN_ON(!monc->hunting);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ret = ceph_auth_handle_reply_done(monc->auth, global_id,
+ reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ finish_auth(monc, ret, was_authed);
+ if (!ret)
+ finish_hunting(monc);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+static int mon_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ struct ceph_mon_client *monc = con->private;
+ bool was_authed;
+
+ mutex_lock(&monc->mutex);
+ WARN_ON(!monc->hunting);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ceph_auth_handle_bad_method(monc->auth, used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt);
+ finish_auth(monc, -EACCES, was_authed);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
/*
* handle incoming message
*/
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+static void mon_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_mon_client *monc = con->private;
int type = le16_to_cpu(msg->hdr.type);
@@ -1360,19 +1565,23 @@ static void mon_fault(struct ceph_connection *con)
* will come from the messenger workqueue, which is drained prior to
* mon_client destruction.
*/
-static struct ceph_connection *con_get(struct ceph_connection *con)
+static struct ceph_connection *mon_get_con(struct ceph_connection *con)
{
return con;
}
-static void con_put(struct ceph_connection *con)
+static void mon_put_con(struct ceph_connection *con)
{
}
static const struct ceph_connection_operations mon_con_ops = {
- .get = con_get,
- .put = con_put,
- .dispatch = dispatch,
- .fault = mon_fault,
+ .get = mon_get_con,
+ .put = mon_put_con,
.alloc_msg = mon_alloc_msg,
+ .dispatch = mon_dispatch,
+ .fault = mon_fault,
+ .get_auth_request = mon_get_auth_request,
+ .handle_auth_reply_more = mon_handle_auth_reply_more,
+ .handle_auth_done = mon_handle_auth_done,
+ .handle_auth_bad_method = mon_handle_auth_bad_method,
};
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7901ab6c79fd..ff8624a7c964 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3918,9 +3918,11 @@ static int handle_one_map(struct ceph_osd_client *osdc,
set_pool_was_full(osdc);
if (incremental)
- newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+ newmap = osdmap_apply_incremental(&p, end,
+ ceph_msgr2(osdc->client),
+ osdc->osdmap);
else
- newmap = ceph_osdmap_decode(&p, end);
+ newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client));
if (IS_ERR(newmap))
return PTR_ERR(newmap);
@@ -5410,7 +5412,7 @@ void ceph_osdc_cleanup(void)
/*
* handle incoming message
*/
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_osd *osd = con->private;
struct ceph_osd_client *osdc = osd->o_osdc;
@@ -5532,9 +5534,9 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
return m;
}
-static struct ceph_msg *alloc_msg(struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- int *skip)
+static struct ceph_msg *osd_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
{
struct ceph_osd *osd = con->private;
int type = le16_to_cpu(hdr->type);
@@ -5558,7 +5560,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
/*
* Wrappers to refcount containing ceph_osd struct
*/
-static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+static struct ceph_connection *osd_get_con(struct ceph_connection *con)
{
struct ceph_osd *osd = con->private;
if (get_osd(osd))
@@ -5566,7 +5568,7 @@ static struct ceph_connection *get_osd_con(struct ceph_connection *con)
return NULL;
}
-static void put_osd_con(struct ceph_connection *con)
+static void osd_put_con(struct ceph_connection *con)
{
struct ceph_osd *osd = con->private;
put_osd(osd);
@@ -5575,39 +5577,29 @@ static void put_osd_con(struct ceph_connection *con)
/*
* authentication
*/
+
/*
* Note: returned pointer is the address of a structure that's
* managed separately. Caller must *not* attempt to free it.
*/
-static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
- int *proto, int force_new)
+static struct ceph_auth_handshake *
+osd_get_authorizer(struct ceph_connection *con, int *proto, int force_new)
{
struct ceph_osd *o = con->private;
struct ceph_osd_client *osdc = o->o_osdc;
struct ceph_auth_client *ac = osdc->client->monc.auth;
struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
- if (force_new && auth->authorizer) {
- ceph_auth_destroy_authorizer(auth->authorizer);
- auth->authorizer = NULL;
- }
- if (!auth->authorizer) {
- int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
- auth);
- if (ret)
- return ERR_PTR(ret);
- } else {
- int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
- auth);
- if (ret)
- return ERR_PTR(ret);
- }
- *proto = ac->protocol;
+ ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+ force_new, proto, NULL, NULL);
+ if (ret)
+ return ERR_PTR(ret);
return auth;
}
-static int add_authorizer_challenge(struct ceph_connection *con,
+static int osd_add_authorizer_challenge(struct ceph_connection *con,
void *challenge_buf, int challenge_buf_len)
{
struct ceph_osd *o = con->private;
@@ -5618,16 +5610,19 @@ static int add_authorizer_challenge(struct ceph_connection *con,
challenge_buf, challenge_buf_len);
}
-static int verify_authorizer_reply(struct ceph_connection *con)
+static int osd_verify_authorizer_reply(struct ceph_connection *con)
{
struct ceph_osd *o = con->private;
struct ceph_osd_client *osdc = o->o_osdc;
struct ceph_auth_client *ac = osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
- return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
+ NULL, NULL, NULL, NULL);
}
-static int invalidate_authorizer(struct ceph_connection *con)
+static int osd_invalidate_authorizer(struct ceph_connection *con)
{
struct ceph_osd *o = con->private;
struct ceph_osd_client *osdc = o->o_osdc;
@@ -5637,6 +5632,80 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&osdc->client->monc);
}
+static int osd_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
+
+ ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int osd_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
+
+ ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int osd_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+
+ return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+
+static int osd_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_mon_client *monc = &o->o_osdc->client->monc;
+ int ret;
+
+ if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD,
+ used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt)) {
+ ret = ceph_monc_validate_auth(monc);
+ if (ret)
+ return ret;
+ }
+
+ return -EACCES;
+}
+
static void osd_reencode_message(struct ceph_msg *msg)
{
int type = le16_to_cpu(msg->hdr.type);
@@ -5662,16 +5731,20 @@ static int osd_check_message_signature(struct ceph_msg *msg)
}
static const struct ceph_connection_operations osd_con_ops = {
- .get = get_osd_con,
- .put = put_osd_con,
- .dispatch = dispatch,
- .get_authorizer = get_authorizer,
- .add_authorizer_challenge = add_authorizer_challenge,
- .verify_authorizer_reply = verify_authorizer_reply,
- .invalidate_authorizer = invalidate_authorizer,
- .alloc_msg = alloc_msg,
+ .get = osd_get_con,
+ .put = osd_put_con,
+ .alloc_msg = osd_alloc_msg,
+ .dispatch = osd_dispatch,
+ .fault = osd_fault,
.reencode_message = osd_reencode_message,
+ .get_authorizer = osd_get_authorizer,
+ .add_authorizer_challenge = osd_add_authorizer_challenge,
+ .verify_authorizer_reply = osd_verify_authorizer_reply,
+ .invalidate_authorizer = osd_invalidate_authorizer,
.sign_message = osd_sign_message,
.check_message_signature = osd_check_message_signature,
- .fault = osd_fault,
+ .get_auth_request = osd_get_auth_request,
+ .handle_auth_reply_more = osd_handle_auth_reply_more,
+ .handle_auth_done = osd_handle_auth_done,
+ .handle_auth_bad_method = osd_handle_auth_bad_method,
};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 96c25f5e064a..2b1dd252f231 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -965,6 +965,143 @@ bad:
}
/*
+ * CRUSH workspaces
+ *
+ * workspace_manager framework borrowed from fs/btrfs/compression.c.
+ * Two simplifications: there is only one type of workspace and there
+ * is always at least one workspace.
+ */
+static struct crush_work *alloc_workspace(const struct crush_map *c)
+{
+ struct crush_work *work;
+ size_t work_size;
+
+ WARN_ON(!c->working_size);
+ work_size = crush_work_size(c, CEPH_PG_MAX_SIZE);
+ dout("%s work_size %zu bytes\n", __func__, work_size);
+
+ work = ceph_kvmalloc(work_size, GFP_NOIO);
+ if (!work)
+ return NULL;
+
+ INIT_LIST_HEAD(&work->item);
+ crush_init_workspace(c, work);
+ return work;
+}
+
+static void free_workspace(struct crush_work *work)
+{
+ WARN_ON(!list_empty(&work->item));
+ kvfree(work);
+}
+
+static void init_workspace_manager(struct workspace_manager *wsm)
+{
+ INIT_LIST_HEAD(&wsm->idle_ws);
+ spin_lock_init(&wsm->ws_lock);
+ atomic_set(&wsm->total_ws, 0);
+ wsm->free_ws = 0;
+ init_waitqueue_head(&wsm->ws_wait);
+}
+
+static void add_initial_workspace(struct workspace_manager *wsm,
+ struct crush_work *work)
+{
+ WARN_ON(!list_empty(&wsm->idle_ws));
+
+ list_add(&work->item, &wsm->idle_ws);
+ atomic_set(&wsm->total_ws, 1);
+ wsm->free_ws = 1;
+}
+
+static void cleanup_workspace_manager(struct workspace_manager *wsm)
+{
+ struct crush_work *work;
+
+ while (!list_empty(&wsm->idle_ws)) {
+ work = list_first_entry(&wsm->idle_ws, struct crush_work,
+ item);
+ list_del_init(&work->item);
+ free_workspace(work);
+ }
+ atomic_set(&wsm->total_ws, 0);
+ wsm->free_ws = 0;
+}
+
+/*
+ * Finds an available workspace or allocates a new one. If it's not
+ * possible to allocate a new one, waits until there is one.
+ */
+static struct crush_work *get_workspace(struct workspace_manager *wsm,
+ const struct crush_map *c)
+{
+ struct crush_work *work;
+ int cpus = num_online_cpus();
+
+again:
+ spin_lock(&wsm->ws_lock);
+ if (!list_empty(&wsm->idle_ws)) {
+ work = list_first_entry(&wsm->idle_ws, struct crush_work,
+ item);
+ list_del_init(&work->item);
+ wsm->free_ws--;
+ spin_unlock(&wsm->ws_lock);
+ return work;
+	}
+ if (atomic_read(&wsm->total_ws) > cpus) {
+ DEFINE_WAIT(wait);
+
+ spin_unlock(&wsm->ws_lock);
+ prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws)
+ schedule();
+ finish_wait(&wsm->ws_wait, &wait);
+ goto again;
+ }
+ atomic_inc(&wsm->total_ws);
+ spin_unlock(&wsm->ws_lock);
+
+ work = alloc_workspace(c);
+ if (!work) {
+ atomic_dec(&wsm->total_ws);
+ wake_up(&wsm->ws_wait);
+
+ /*
+ * Do not return the error but go back to waiting. We
+ * have the initial workspace and the CRUSH computation
+ * time is bounded so we will get it eventually.
+ */
+ WARN_ON(atomic_read(&wsm->total_ws) < 1);
+ goto again;
+ }
+ return work;
+}
+
+/*
+ * Puts a workspace back on the list or frees it if we have enough
+ * idle ones sitting around.
+ */
+static void put_workspace(struct workspace_manager *wsm,
+ struct crush_work *work)
+{
+ spin_lock(&wsm->ws_lock);
+ if (wsm->free_ws <= num_online_cpus()) {
+ list_add(&work->item, &wsm->idle_ws);
+ wsm->free_ws++;
+ spin_unlock(&wsm->ws_lock);
+ goto wake;
+ }
+ spin_unlock(&wsm->ws_lock);
+
+ free_workspace(work);
+ atomic_dec(&wsm->total_ws);
+wake:
+ if (wq_has_sleeper(&wsm->ws_wait))
+ wake_up(&wsm->ws_wait);
+}
+
+/*
* osd map
*/
struct ceph_osdmap *ceph_osdmap_alloc(void)
@@ -981,7 +1118,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
map->primary_temp = RB_ROOT;
map->pg_upmap = RB_ROOT;
map->pg_upmap_items = RB_ROOT;
- mutex_init(&map->crush_workspace_mutex);
+
+ init_workspace_manager(&map->crush_wsm);
return map;
}
@@ -989,8 +1127,11 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
dout("osdmap_destroy %p\n", map);
+
if (map->crush)
crush_destroy(map->crush);
+ cleanup_workspace_manager(&map->crush_wsm);
+
while (!RB_EMPTY_ROOT(&map->pg_temp)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_temp),
@@ -1029,7 +1170,6 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
kvfree(map->osd_weight);
kvfree(map->osd_addr);
kvfree(map->osd_primary_affinity);
- kvfree(map->crush_workspace);
kfree(map);
}
@@ -1104,26 +1244,22 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
{
- void *workspace;
- size_t work_size;
+ struct crush_work *work;
if (IS_ERR(crush))
return PTR_ERR(crush);
- work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
- dout("%s work_size %zu bytes\n", __func__, work_size);
- workspace = ceph_kvmalloc(work_size, GFP_NOIO);
- if (!workspace) {
+ work = alloc_workspace(crush);
+ if (!work) {
crush_destroy(crush);
return -ENOMEM;
}
- crush_init_workspace(crush, workspace);
if (map->crush)
crush_destroy(map->crush);
- kvfree(map->crush_workspace);
+ cleanup_workspace_manager(&map->crush_wsm);
map->crush = crush;
- map->crush_workspace = workspace;
+ add_initial_workspace(&map->crush_wsm, work);
return 0;
}
@@ -1511,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end,
/*
* decode a full map.
*/
-static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
+static int osdmap_decode(void **p, void *end, bool msgr2,
+ struct ceph_osdmap *map)
{
u8 struct_v;
u32 epoch = 0;
@@ -1582,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
goto e_inval;
for (i = 0; i < map->max_osd; i++) {
- err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]);
+ struct ceph_entity_addr *addr = &map->osd_addr[i];
+
+ if (struct_v >= 8)
+ err = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+ else
+ err = ceph_decode_entity_addr(p, end, addr);
if (err)
goto bad;
+
+ dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr));
}
/* pg_temp */
@@ -1654,7 +1798,7 @@ bad:
/*
* Allocate and decode a full map.
*/
-struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2)
{
struct ceph_osdmap *map;
int ret;
@@ -1663,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
if (!map)
return ERR_PTR(-ENOMEM);
- ret = osdmap_decode(p, end, map);
+ ret = osdmap_decode(p, end, msgr2, map);
if (ret) {
ceph_osdmap_destroy(map);
return ERR_PTR(ret);
@@ -1681,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
* new_state: { osd=6, xorstate=EXISTS } # clear osd_state
*/
static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
- struct ceph_osdmap *map)
+ bool msgr2, struct ceph_osdmap *map)
{
void *new_up_client;
void *new_state;
void *new_weight_end;
u32 len;
+ int ret;
int i;
new_up_client = *p;
@@ -1695,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
struct ceph_entity_addr addr;
ceph_decode_skip_32(p, end, e_inval);
- if (ceph_decode_entity_addr(p, end, &addr))
- goto e_inval;
+ if (struct_v >= 7)
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &addr);
+ if (ret)
+ return ret;
}
new_state = *p;
@@ -1738,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
while (len--) {
s32 osd;
u32 xorstate;
- int ret;
osd = ceph_decode_32(p);
if (struct_v >= 5)
@@ -1774,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
osd = ceph_decode_32(p);
BUG_ON(osd >= map->max_osd);
- if (ceph_decode_entity_addr(p, end, &addr))
- goto e_inval;
+ if (struct_v >= 7)
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &addr);
+ if (ret)
+ return ret;
+
+ dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr));
+
pr_info("osd%d up\n", osd);
map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
map->osd_addr[osd] = addr;
@@ -1791,7 +1946,7 @@ e_inval:
/*
* decode and apply an incremental map update.
*/
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
struct ceph_osdmap *map)
{
struct ceph_fsid fsid;
@@ -1826,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
if (len > 0) {
dout("apply_incremental full map len %d, %p to %p\n",
len, *p, end);
- return ceph_osdmap_decode(p, min(*p+len, end));
+ return ceph_osdmap_decode(p, min(*p+len, end), msgr2);
}
/* new crush? */
@@ -1878,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
}
/* new_up_client, new_state, new_weight */
- err = decode_new_up_state_weight(p, end, struct_v, map);
+ err = decode_new_up_state_weight(p, end, struct_v, msgr2, map);
if (err)
goto bad;
@@ -2322,6 +2477,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
s64 choose_args_index)
{
struct crush_choose_arg_map *arg_map;
+ struct crush_work *work;
int r;
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
@@ -2332,12 +2488,11 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
CEPH_DEFAULT_CHOOSE_ARGS);
- mutex_lock(&map->crush_workspace_mutex);
+ work = get_workspace(&map->crush_wsm, map->crush);
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
- weight, weight_max, map->crush_workspace,
+ weight, weight_max, work,
arg_map ? arg_map->args : NULL);
- mutex_unlock(&map->crush_workspace_mutex);
-
+ put_workspace(&map->crush_wsm, work);
return r;
}
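The workspace manager above replaces the single crush_workspace buffer guarded by crush_workspace_mutex with a small pool: idle workspaces sit on a list, the pool may grow to roughly num_online_cpus() entries, and a caller that can neither find an idle workspace nor allocate a new one sleeps on a waitqueue instead of failing (the initial workspace added in osdmap_set_crush() keeps the wait bounded). Below is a stand-alone user-space sketch of the same get/put discipline, using a mutex and condition variable in place of the spinlock and waitqueue and a fixed cap in place of num_online_cpus(); it illustrates the pattern, not the kernel code.

/* Stand-alone sketch of the get/put workspace pattern (user space). */
#include <pthread.h>
#include <stdlib.h>

struct workspace {
	struct workspace *next;
	/* scratch buffers would live here */
};

struct ws_pool {
	pthread_mutex_t	 lock;
	pthread_cond_t	 wait;
	struct workspace *idle;	/* idle workspaces */
	int		 total;	/* workspaces in existence */
	int		 max;	/* soft cap, e.g. the CPU count */
};

/*
 * As in the kernel code, this assumes the pool was seeded with one
 * workspace up front, so waiting is always bounded.
 */
static struct workspace *ws_get(struct ws_pool *p)
{
	struct workspace *w;

	pthread_mutex_lock(&p->lock);
	for (;;) {
		if (p->idle) {			/* fast path: reuse an idle one */
			w = p->idle;
			p->idle = w->next;
			pthread_mutex_unlock(&p->lock);
			return w;
		}
		if (p->total < p->max) {	/* allowed to grow the pool */
			p->total++;
			pthread_mutex_unlock(&p->lock);
			w = calloc(1, sizeof(*w));
			if (w)
				return w;
			pthread_mutex_lock(&p->lock);
			p->total--;		/* allocation failed, wait instead */
		}
		pthread_cond_wait(&p->wait, &p->lock);
	}
}

static void ws_put(struct ws_pool *p, struct workspace *w)
{
	pthread_mutex_lock(&p->lock);
	w->next = p->idle;	/* a real pool might free extras instead */
	p->idle = w;
	pthread_mutex_unlock(&p->lock);
	pthread_cond_signal(&p->wait);
}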
diff --git a/net/compat.c b/net/compat.c
index 95ce707a30a3..ddd15af3a283 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -98,8 +98,8 @@ int get_compat_msghdr(struct msghdr *kmsg,
if (err)
return err;
- err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr),
- len, UIO_FASTIOV, iov, &kmsg->msg_iter);
+ err = import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr), len,
+ UIO_FASTIOV, iov, &kmsg->msg_iter);
return err < 0 ? err : 0;
}
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index b988f48153a4..4edd033e899c 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -6,550 +6,48 @@
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
-#define SK_STORAGE_CREATE_FLAG_MASK \
- (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+DEFINE_BPF_STORAGE_CACHE(sk_cache);
-struct bucket {
- struct hlist_head list;
- raw_spinlock_t lock;
-};
-
-/* Thp map is not the primary owner of a bpf_sk_storage_elem.
- * Instead, the sk->sk_bpf_storage is.
- *
- * The map (bpf_sk_storage_map) is for two purposes
- * 1. Define the size of the "sk local storage". It is
- * the map's value_size.
- *
- * 2. Maintain a list to keep track of all elems such
- * that they can be cleaned up during the map destruction.
- *
- * When a bpf local storage is being looked up for a
- * particular sk, the "bpf_map" pointer is actually used
- * as the "key" to search in the list of elem in
- * sk->sk_bpf_storage.
- *
- * Hence, consider sk->sk_bpf_storage is the mini-map
- * with the "bpf_map" pointer as the searching key.
- */
-struct bpf_sk_storage_map {
- struct bpf_map map;
- /* Lookup elem does not require accessing the map.
- *
- * Updating/Deleting requires a bucket lock to
- * link/unlink the elem from the map. Having
- * multiple buckets to improve contention.
- */
- struct bucket *buckets;
- u32 bucket_log;
- u16 elem_size;
- u16 cache_idx;
-};
-
-struct bpf_sk_storage_data {
- /* smap is used as the searching key when looking up
- * from sk->sk_bpf_storage.
- *
- * Put it in the same cacheline as the data to minimize
- * the number of cachelines access during the cache hit case.
- */
- struct bpf_sk_storage_map __rcu *smap;
- u8 data[] __aligned(8);
-};
-
-/* Linked to bpf_sk_storage and bpf_sk_storage_map */
-struct bpf_sk_storage_elem {
- struct hlist_node map_node; /* Linked to bpf_sk_storage_map */
- struct hlist_node snode; /* Linked to bpf_sk_storage */
- struct bpf_sk_storage __rcu *sk_storage;
- struct rcu_head rcu;
- /* 8 bytes hole */
- /* The data is stored in aother cacheline to minimize
- * the number of cachelines access during a cache hit.
- */
- struct bpf_sk_storage_data sdata ____cacheline_aligned;
-};
-
-#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata)
-#define SDATA(_SELEM) (&(_SELEM)->sdata)
-#define BPF_SK_STORAGE_CACHE_SIZE 16
-
-static DEFINE_SPINLOCK(cache_idx_lock);
-static u64 cache_idx_usage_counts[BPF_SK_STORAGE_CACHE_SIZE];
-
-struct bpf_sk_storage {
- struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE];
- struct hlist_head list; /* List of bpf_sk_storage_elem */
- struct sock *sk; /* The sk that owns the the above "list" of
- * bpf_sk_storage_elem.
- */
- struct rcu_head rcu;
- raw_spinlock_t lock; /* Protect adding/removing from the "list" */
-};
-
-static struct bucket *select_bucket(struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *selem)
+static struct bpf_local_storage_data *
+bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
{
- return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
-}
-
-static int omem_charge(struct sock *sk, unsigned int size)
-{
- /* same check as in sock_kmalloc() */
- if (size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
- atomic_add(size, &sk->sk_omem_alloc);
- return 0;
- }
-
- return -ENOMEM;
-}
-
-static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem)
-{
- return !hlist_unhashed(&selem->snode);
-}
-
-static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem)
-{
- return !hlist_unhashed(&selem->map_node);
-}
-
-static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap,
- struct sock *sk, void *value,
- bool charge_omem)
-{
- struct bpf_sk_storage_elem *selem;
-
- if (charge_omem && omem_charge(sk, smap->elem_size))
- return NULL;
-
- selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
- if (selem) {
- if (value)
- memcpy(SDATA(selem)->data, value, smap->map.value_size);
- return selem;
- }
-
- if (charge_omem)
- atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
-
- return NULL;
-}
-
-/* sk_storage->lock must be held and selem->sk_storage == sk_storage.
- * The caller must ensure selem->smap is still valid to be
- * dereferenced for its smap->elem_size and smap->cache_idx.
- */
-static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage,
- struct bpf_sk_storage_elem *selem,
- bool uncharge_omem)
-{
- struct bpf_sk_storage_map *smap;
- bool free_sk_storage;
- struct sock *sk;
-
- smap = rcu_dereference(SDATA(selem)->smap);
- sk = sk_storage->sk;
-
- /* All uncharging on sk->sk_omem_alloc must be done first.
- * sk may be freed once the last selem is unlinked from sk_storage.
- */
- if (uncharge_omem)
- atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
-
- free_sk_storage = hlist_is_singular_node(&selem->snode,
- &sk_storage->list);
- if (free_sk_storage) {
- atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc);
- sk_storage->sk = NULL;
- /* After this RCU_INIT, sk may be freed and cannot be used */
- RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
-
- /* sk_storage is not freed now. sk_storage->lock is
- * still held and raw_spin_unlock_bh(&sk_storage->lock)
- * will be done by the caller.
- *
- * Although the unlock will be done under
- * rcu_read_lock(), it is more intutivie to
- * read if kfree_rcu(sk_storage, rcu) is done
- * after the raw_spin_unlock_bh(&sk_storage->lock).
- *
- * Hence, a "bool free_sk_storage" is returned
- * to the caller which then calls the kfree_rcu()
- * after unlock.
- */
- }
- hlist_del_init_rcu(&selem->snode);
- if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) ==
- SDATA(selem))
- RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL);
-
- kfree_rcu(selem, rcu);
-
- return free_sk_storage;
-}
-
-static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
-{
- struct bpf_sk_storage *sk_storage;
- bool free_sk_storage = false;
-
- if (unlikely(!selem_linked_to_sk(selem)))
- /* selem has already been unlinked from sk */
- return;
-
- sk_storage = rcu_dereference(selem->sk_storage);
- raw_spin_lock_bh(&sk_storage->lock);
- if (likely(selem_linked_to_sk(selem)))
- free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
- raw_spin_unlock_bh(&sk_storage->lock);
-
- if (free_sk_storage)
- kfree_rcu(sk_storage, rcu);
-}
-
-static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
- struct bpf_sk_storage_elem *selem)
-{
- RCU_INIT_POINTER(selem->sk_storage, sk_storage);
- hlist_add_head(&selem->snode, &sk_storage->list);
-}
-
-static void selem_unlink_map(struct bpf_sk_storage_elem *selem)
-{
- struct bpf_sk_storage_map *smap;
- struct bucket *b;
-
- if (unlikely(!selem_linked_to_map(selem)))
- /* selem has already be unlinked from smap */
- return;
-
- smap = rcu_dereference(SDATA(selem)->smap);
- b = select_bucket(smap, selem);
- raw_spin_lock_bh(&b->lock);
- if (likely(selem_linked_to_map(selem)))
- hlist_del_init_rcu(&selem->map_node);
- raw_spin_unlock_bh(&b->lock);
-}
-
-static void selem_link_map(struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *selem)
-{
- struct bucket *b = select_bucket(smap, selem);
-
- raw_spin_lock_bh(&b->lock);
- RCU_INIT_POINTER(SDATA(selem)->smap, smap);
- hlist_add_head_rcu(&selem->map_node, &b->list);
- raw_spin_unlock_bh(&b->lock);
-}
-
-static void selem_unlink(struct bpf_sk_storage_elem *selem)
-{
- /* Always unlink from map before unlinking from sk_storage
- * because selem will be freed after successfully unlinked from
- * the sk_storage.
- */
- selem_unlink_map(selem);
- selem_unlink_sk(selem);
-}
-
-static struct bpf_sk_storage_data *
-__sk_storage_lookup(struct bpf_sk_storage *sk_storage,
- struct bpf_sk_storage_map *smap,
- bool cacheit_lockit)
-{
- struct bpf_sk_storage_data *sdata;
- struct bpf_sk_storage_elem *selem;
-
- /* Fast path (cache hit) */
- sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]);
- if (sdata && rcu_access_pointer(sdata->smap) == smap)
- return sdata;
-
- /* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &sk_storage->list, snode)
- if (rcu_access_pointer(SDATA(selem)->smap) == smap)
- break;
-
- if (!selem)
- return NULL;
-
- sdata = SDATA(selem);
- if (cacheit_lockit) {
- /* spinlock is needed to avoid racing with the
- * parallel delete. Otherwise, publishing an already
- * deleted sdata to the cache will become a use-after-free
- * problem in the next __sk_storage_lookup().
- */
- raw_spin_lock_bh(&sk_storage->lock);
- if (selem_linked_to_sk(selem))
- rcu_assign_pointer(sk_storage->cache[smap->cache_idx],
- sdata);
- raw_spin_unlock_bh(&sk_storage->lock);
- }
-
- return sdata;
-}
-
-static struct bpf_sk_storage_data *
-sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
-{
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_map *smap;
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage)
return NULL;
- smap = (struct bpf_sk_storage_map *)map;
- return __sk_storage_lookup(sk_storage, smap, cacheit_lockit);
-}
-
-static int check_flags(const struct bpf_sk_storage_data *old_sdata,
- u64 map_flags)
-{
- if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
- /* elem already exists */
- return -EEXIST;
-
- if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
- /* elem doesn't exist, cannot update it */
- return -ENOENT;
-
- return 0;
-}
-
-static int sk_storage_alloc(struct sock *sk,
- struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *first_selem)
-{
- struct bpf_sk_storage *prev_sk_storage, *sk_storage;
- int err;
-
- err = omem_charge(sk, sizeof(*sk_storage));
- if (err)
- return err;
-
- sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN);
- if (!sk_storage) {
- err = -ENOMEM;
- goto uncharge;
- }
- INIT_HLIST_HEAD(&sk_storage->list);
- raw_spin_lock_init(&sk_storage->lock);
- sk_storage->sk = sk;
-
- __selem_link_sk(sk_storage, first_selem);
- selem_link_map(smap, first_selem);
- /* Publish sk_storage to sk. sk->sk_lock cannot be acquired.
- * Hence, atomic ops is used to set sk->sk_bpf_storage
- * from NULL to the newly allocated sk_storage ptr.
- *
- * From now on, the sk->sk_bpf_storage pointer is protected
- * by the sk_storage->lock. Hence, when freeing
- * the sk->sk_bpf_storage, the sk_storage->lock must
- * be held before setting sk->sk_bpf_storage to NULL.
- */
- prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage,
- NULL, sk_storage);
- if (unlikely(prev_sk_storage)) {
- selem_unlink_map(first_selem);
- err = -EAGAIN;
- goto uncharge;
-
- /* Note that even first_selem was linked to smap's
- * bucket->list, first_selem can be freed immediately
- * (instead of kfree_rcu) because
- * bpf_sk_storage_map_free() does a
- * synchronize_rcu() before walking the bucket->list.
- * Hence, no one is accessing selem from the
- * bucket->list under rcu_read_lock().
- */
- }
-
- return 0;
-
-uncharge:
- kfree(sk_storage);
- atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc);
- return err;
-}
-
-/* sk cannot be going away because it is linking new elem
- * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
- * Otherwise, it will become a leak (and other memory issues
- * during map destruction).
- */
-static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
- struct bpf_map *map,
- void *value,
- u64 map_flags)
-{
- struct bpf_sk_storage_data *old_sdata = NULL;
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_map *smap;
- int err;
-
- /* BPF_EXIST and BPF_NOEXIST cannot be both set */
- if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
- /* BPF_F_LOCK can only be used in a value with spin_lock */
- unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
- return ERR_PTR(-EINVAL);
-
- smap = (struct bpf_sk_storage_map *)map;
- sk_storage = rcu_dereference(sk->sk_bpf_storage);
- if (!sk_storage || hlist_empty(&sk_storage->list)) {
- /* Very first elem for this sk */
- err = check_flags(NULL, map_flags);
- if (err)
- return ERR_PTR(err);
-
- selem = selem_alloc(smap, sk, value, true);
- if (!selem)
- return ERR_PTR(-ENOMEM);
-
- err = sk_storage_alloc(sk, smap, selem);
- if (err) {
- kfree(selem);
- atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
- return ERR_PTR(err);
- }
-
- return SDATA(selem);
- }
-
- if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
- /* Hoping to find an old_sdata to do inline update
- * such that it can avoid taking the sk_storage->lock
- * and changing the lists.
- */
- old_sdata = __sk_storage_lookup(sk_storage, smap, false);
- err = check_flags(old_sdata, map_flags);
- if (err)
- return ERR_PTR(err);
- if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) {
- copy_map_value_locked(map, old_sdata->data,
- value, false);
- return old_sdata;
- }
- }
-
- raw_spin_lock_bh(&sk_storage->lock);
-
- /* Recheck sk_storage->list under sk_storage->lock */
- if (unlikely(hlist_empty(&sk_storage->list))) {
- /* A parallel del is happening and sk_storage is going
- * away. It has just been checked before, so very
- * unlikely. Return instead of retry to keep things
- * simple.
- */
- err = -EAGAIN;
- goto unlock_err;
- }
-
- old_sdata = __sk_storage_lookup(sk_storage, smap, false);
- err = check_flags(old_sdata, map_flags);
- if (err)
- goto unlock_err;
-
- if (old_sdata && (map_flags & BPF_F_LOCK)) {
- copy_map_value_locked(map, old_sdata->data, value, false);
- selem = SELEM(old_sdata);
- goto unlock;
- }
-
- /* sk_storage->lock is held. Hence, we are sure
- * we can unlink and uncharge the old_sdata successfully
- * later. Hence, instead of charging the new selem now
- * and then uncharge the old selem later (which may cause
- * a potential but unnecessary charge failure), avoid taking
- * a charge at all here (the "!old_sdata" check) and the
- * old_sdata will not be uncharged later during __selem_unlink_sk().
- */
- selem = selem_alloc(smap, sk, value, !old_sdata);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
- }
-
- /* First, link the new selem to the map */
- selem_link_map(smap, selem);
-
- /* Second, link (and publish) the new selem to sk_storage */
- __selem_link_sk(sk_storage, selem);
-
- /* Third, remove old selem, SELEM(old_sdata) */
- if (old_sdata) {
- selem_unlink_map(SELEM(old_sdata));
- __selem_unlink_sk(sk_storage, SELEM(old_sdata), false);
- }
-
-unlock:
- raw_spin_unlock_bh(&sk_storage->lock);
- return SDATA(selem);
-
-unlock_err:
- raw_spin_unlock_bh(&sk_storage->lock);
- return ERR_PTR(err);
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit);
}
-static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
+static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
- sdata = sk_storage_lookup(sk, map, false);
+ sdata = bpf_sk_storage_lookup(sk, map, false);
if (!sdata)
return -ENOENT;
- selem_unlink(SELEM(sdata));
+ bpf_selem_unlink(SELEM(sdata));
return 0;
}
-static u16 cache_idx_get(void)
-{
- u64 min_usage = U64_MAX;
- u16 i, res = 0;
-
- spin_lock(&cache_idx_lock);
-
- for (i = 0; i < BPF_SK_STORAGE_CACHE_SIZE; i++) {
- if (cache_idx_usage_counts[i] < min_usage) {
- min_usage = cache_idx_usage_counts[i];
- res = i;
-
- /* Found a free cache_idx */
- if (!min_usage)
- break;
- }
- }
- cache_idx_usage_counts[res]++;
-
- spin_unlock(&cache_idx_lock);
-
- return res;
-}
-
-static void cache_idx_free(u16 idx)
-{
- spin_lock(&cache_idx_lock);
- cache_idx_usage_counts[idx]--;
- spin_unlock(&cache_idx_lock);
-}
-
/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage *sk_storage;
bool free_sk_storage = false;
struct hlist_node *n;
@@ -565,7 +63,7 @@ void bpf_sk_storage_free(struct sock *sk)
* Thus, no elem can be added-to or deleted-from the
* sk_storage->list by the bpf_prog or by the bpf-map's syscall.
*
- * It is racing with bpf_sk_storage_map_free() alone
+ * It is racing with bpf_local_storage_map_free() alone
* when unlinking elem from the sk_storage->list and
* the map's bucket->list.
*/
@@ -574,8 +72,9 @@ void bpf_sk_storage_free(struct sock *sk)
/* Always unlink from map before unlinking from
* sk_storage.
*/
- selem_unlink_map(selem);
- free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
+ bpf_selem_unlink_map(selem);
+ free_sk_storage = bpf_selem_unlink_storage_nolock(sk_storage,
+ selem, true);
}
raw_spin_unlock_bh(&sk_storage->lock);
rcu_read_unlock();
@@ -586,130 +85,22 @@ void bpf_sk_storage_free(struct sock *sk)
static void bpf_sk_storage_map_free(struct bpf_map *map)
{
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage_map *smap;
- struct bucket *b;
- unsigned int i;
-
- smap = (struct bpf_sk_storage_map *)map;
-
- cache_idx_free(smap->cache_idx);
-
- /* Note that this map might be concurrently cloned from
- * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
- * RCU read section to finish before proceeding. New RCU
- * read sections should be prevented via bpf_map_inc_not_zero.
- */
- synchronize_rcu();
+ struct bpf_local_storage_map *smap;
- /* bpf prog and the userspace can no longer access this map
- * now. No new selem (of this map) can be added
- * to the sk->sk_bpf_storage or to the map bucket's list.
- *
- * The elem of this map can be cleaned up here
- * or
- * by bpf_sk_storage_free() during __sk_destruct().
- */
- for (i = 0; i < (1U << smap->bucket_log); i++) {
- b = &smap->buckets[i];
-
- rcu_read_lock();
- /* No one is adding to b->list now */
- while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)),
- struct bpf_sk_storage_elem,
- map_node))) {
- selem_unlink(selem);
- cond_resched_rcu();
- }
- rcu_read_unlock();
- }
-
- /* bpf_sk_storage_free() may still need to access the map.
- * e.g. bpf_sk_storage_free() has unlinked selem from the map
- * which then made the above while((selem = ...)) loop
- * exited immediately.
- *
- * However, the bpf_sk_storage_free() still needs to access
- * the smap->elem_size to do the uncharging in
- * __selem_unlink_sk().
- *
- * Hence, wait another rcu grace period for the
- * bpf_sk_storage_free() to finish.
- */
- synchronize_rcu();
-
- kvfree(smap->buckets);
- kfree(map);
-}
-
-/* U16_MAX is much more than enough for sk local storage
- * considering a tcp_sock is ~2k.
- */
-#define MAX_VALUE_SIZE \
- min_t(u32, \
- (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \
- (U16_MAX - sizeof(struct bpf_sk_storage_elem)))
-
-static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
-{
- if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
- !(attr->map_flags & BPF_F_NO_PREALLOC) ||
- attr->max_entries ||
- attr->key_size != sizeof(int) || !attr->value_size ||
- /* Enforce BTF for userspace sk dumping */
- !attr->btf_key_type_id || !attr->btf_value_type_id)
- return -EINVAL;
-
- if (!bpf_capable())
- return -EPERM;
-
- if (attr->value_size > MAX_VALUE_SIZE)
- return -E2BIG;
-
- return 0;
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
+ bpf_local_storage_map_free(smap);
}
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_sk_storage_map *smap;
- unsigned int i;
- u32 nbuckets;
- u64 cost;
- int ret;
-
- smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- bpf_map_init_from_attr(&smap->map, attr);
-
- nbuckets = roundup_pow_of_two(num_possible_cpus());
- /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- nbuckets = max_t(u32, 2, nbuckets);
- smap->bucket_log = ilog2(nbuckets);
- cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
-
- ret = bpf_map_charge_init(&smap->map.memory, cost);
- if (ret < 0) {
- kfree(smap);
- return ERR_PTR(ret);
- }
-
- smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
- GFP_USER | __GFP_NOWARN);
- if (!smap->buckets) {
- bpf_map_charge_finish(&smap->map.memory);
- kfree(smap);
- return ERR_PTR(-ENOMEM);
- }
+ struct bpf_local_storage_map *smap;
- for (i = 0; i < nbuckets; i++) {
- INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
- }
-
- smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
- smap->cache_idx = cache_idx_get();
+ smap = bpf_local_storage_map_alloc(attr);
+ if (IS_ERR(smap))
+ return ERR_CAST(smap);
+ smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache);
return &smap->map;
}
@@ -719,33 +110,16 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static int bpf_sk_storage_map_check_btf(const struct bpf_map *map,
- const struct btf *btf,
- const struct btf_type *key_type,
- const struct btf_type *value_type)
-{
- u32 int_data;
-
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
- return -EINVAL;
-
- return 0;
-}
-
static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
struct socket *sock;
int fd, err;
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
- sdata = sk_storage_lookup(sock->sk, map, true);
+ sdata = bpf_sk_storage_lookup(sock->sk, map, true);
sockfd_put(sock);
return sdata ? sdata->data : NULL;
}
@@ -756,14 +130,16 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
struct socket *sock;
int fd, err;
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
- sdata = sk_storage_update(sock->sk, map, value, map_flags);
+ sdata = bpf_local_storage_update(
+ sock->sk, (struct bpf_local_storage_map *)map, value,
+ map_flags);
sockfd_put(sock);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -779,7 +155,7 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
- err = sk_storage_delete(sock->sk, map);
+ err = bpf_sk_storage_del(sock->sk, map);
sockfd_put(sock);
return err;
}
@@ -787,14 +163,14 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
-static struct bpf_sk_storage_elem *
+static struct bpf_local_storage_elem *
bpf_sk_storage_clone_elem(struct sock *newsk,
- struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *selem)
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
{
- struct bpf_sk_storage_elem *copy_selem;
+ struct bpf_local_storage_elem *copy_selem;
- copy_selem = selem_alloc(smap, newsk, NULL, true);
+ copy_selem = bpf_selem_alloc(smap, newsk, NULL, true);
if (!copy_selem)
return NULL;
@@ -810,9 +186,9 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
{
- struct bpf_sk_storage *new_sk_storage = NULL;
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_elem *selem;
+ struct bpf_local_storage *new_sk_storage = NULL;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
int ret = 0;
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
@@ -824,8 +200,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
goto out;
hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
- struct bpf_sk_storage_elem *copy_selem;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage_elem *copy_selem;
+ struct bpf_local_storage_map *smap;
struct bpf_map *map;
smap = rcu_dereference(SDATA(selem)->smap);
@@ -833,7 +209,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
continue;
/* Note that for lockless listeners adding new element
- * here can race with cleanup in bpf_sk_storage_map_free.
+ * here can race with cleanup in bpf_local_storage_map_free.
* Try to grab map refcnt to make sure that it's still
* alive and prevent concurrent removal.
*/
@@ -849,10 +225,10 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
}
if (new_sk_storage) {
- selem_link_map(smap, copy_selem);
- __selem_link_sk(new_sk_storage, copy_selem);
+ bpf_selem_link_map(smap, copy_selem);
+ bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
} else {
- ret = sk_storage_alloc(newsk, smap, copy_selem);
+ ret = bpf_local_storage_alloc(newsk, smap, copy_selem);
if (ret) {
kfree(copy_selem);
atomic_sub(smap->elem_size,
@@ -861,7 +237,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
goto out;
}
- new_sk_storage = rcu_dereference(copy_selem->sk_storage);
+ new_sk_storage =
+ rcu_dereference(copy_selem->local_storage);
}
bpf_map_put(map);
}
@@ -879,12 +256,12 @@ out:
BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
- if (flags > BPF_SK_STORAGE_GET_F_CREATE)
+ if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
return (unsigned long)NULL;
- sdata = sk_storage_lookup(sk, map, true);
+ sdata = bpf_sk_storage_lookup(sk, map, true);
if (sdata)
return (unsigned long)sdata->data;
@@ -895,7 +272,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
* destruction).
*/
refcount_inc_not_zero(&sk->sk_refcnt)) {
- sdata = sk_storage_update(sk, map, value, BPF_NOEXIST);
+ sdata = bpf_local_storage_update(
+ sk, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST);
/* sk must be a fullsock (guaranteed by verifier),
* so sock_gen_put() is unnecessary.
*/
@@ -909,10 +288,13 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
{
+ if (!sk || !sk_fullsock(sk))
+ return -EINVAL;
+
if (refcount_inc_not_zero(&sk->sk_refcnt)) {
int err;
- err = sk_storage_delete(sk, map);
+ err = bpf_sk_storage_del(sk, map);
sock_put(sk);
return err;
}
@@ -920,18 +302,53 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
return -ENOENT;
}
+static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ struct sock *sk = (struct sock *)owner;
+
+ /* same check as in sock_kmalloc() */
+ if (size <= sysctl_optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ atomic_add(size, &sk->sk_omem_alloc);
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ struct sock *sk = owner;
+
+ atomic_sub(size, &sk->sk_omem_alloc);
+}
+
+static struct bpf_local_storage __rcu **
+bpf_sk_storage_ptr(void *owner)
+{
+ struct sock *sk = owner;
+
+ return &sk->sk_bpf_storage;
+}
+
static int sk_storage_map_btf_id;
const struct bpf_map_ops sk_storage_map_ops = {
- .map_alloc_check = bpf_sk_storage_map_alloc_check,
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
.map_alloc = bpf_sk_storage_map_alloc,
.map_free = bpf_sk_storage_map_free,
.map_get_next_key = notsupp_get_next_key,
.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
.map_update_elem = bpf_fd_sk_storage_update_elem,
.map_delete_elem = bpf_fd_sk_storage_delete_elem,
- .map_check_btf = bpf_sk_storage_map_check_btf,
- .map_btf_name = "bpf_sk_storage_map",
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_name = "bpf_local_storage_map",
.map_btf_id = &sk_storage_map_btf_id,
+ .map_local_storage_charge = bpf_sk_storage_charge,
+ .map_local_storage_uncharge = bpf_sk_storage_uncharge,
+ .map_owner_storage_ptr = bpf_sk_storage_ptr,
};
const struct bpf_func_proto bpf_sk_storage_get_proto = {
@@ -939,7 +356,7 @@ const struct bpf_func_proto bpf_sk_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_SOCKET,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
@@ -959,7 +376,81 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_SOCKET,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
+static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
+{
+ const struct btf *btf_vmlinux;
+ const struct btf_type *t;
+ const char *tname;
+ u32 btf_id;
+
+ if (prog->aux->dst_prog)
+ return false;
+
+ /* Ensure the tracing program is not tracing
+ * any bpf_sk_storage*() function while also using
+ * the bpf_sk_storage_(get|delete) helpers.
+ */
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_ITER:
+ case BPF_TRACE_RAW_TP:
+ /* bpf_sk_storage has no trace point */
+ return true;
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ btf_vmlinux = bpf_get_btf_vmlinux();
+ btf_id = prog->aux->attach_btf_id;
+ t = btf_type_by_id(btf_vmlinux, btf_id);
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+ return !!strncmp(tname, "bpf_sk_storage",
+ strlen("bpf_sk_storage"));
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags)
+{
+ if (in_irq() || in_nmi())
+ return (unsigned long)NULL;
+
+ return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
+}
+
+BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
+ struct sock *, sk)
+{
+ if (in_irq() || in_nmi())
+ return -EPERM;
+
+ return ____bpf_sk_storage_delete(map, sk);
+}
+
+const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = {
+ .func = bpf_sk_storage_get_tracing,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+ .allowed = bpf_sk_storage_tracing_allowed,
+};
+
+const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = {
+ .func = bpf_sk_storage_delete_tracing,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ .allowed = bpf_sk_storage_tracing_allowed,
};
struct bpf_sk_storage_diag {
@@ -1022,7 +513,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
u32 nr_maps = 0;
int rem, err;
- /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
+ /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as
* the map_alloc_check() side also does.
*/
if (!bpf_capable())
@@ -1072,13 +563,13 @@ err_free:
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
-static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb)
+static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
{
struct nlattr *nla_stg, *nla_value;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage_map *smap;
/* It cannot exceed max nlattr's payload */
- BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE);
+ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE);
nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
if (!nla_stg)
@@ -1114,9 +605,9 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
{
/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
unsigned int diag_size = nla_total_size(0);
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
struct nlattr *nla_stgs;
unsigned int saved_len;
int err = 0;
@@ -1169,8 +660,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
{
/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
unsigned int diag_size = nla_total_size(0);
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_data *sdata;
struct nlattr *nla_stgs;
unsigned int saved_len;
int err = 0;
@@ -1197,8 +688,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
saved_len = skb->len;
for (i = 0; i < diag->nr_maps; i++) {
- sdata = __sk_storage_lookup(sk_storage,
- (struct bpf_sk_storage_map *)diag->maps[i],
+ sdata = bpf_local_storage_lookup(sk_storage,
+ (struct bpf_local_storage_map *)diag->maps[i],
false);
if (!sdata)
@@ -1235,19 +726,20 @@ struct bpf_iter_seq_sk_storage_map_info {
unsigned skip_elems;
};
-static struct bpf_sk_storage_elem *
+static struct bpf_local_storage_elem *
bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
- struct bpf_sk_storage_elem *prev_selem)
+ struct bpf_local_storage_elem *prev_selem)
+ __acquires(RCU) __releases(RCU)
{
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_elem *selem;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
u32 skip_elems = info->skip_elems;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage_map *smap;
u32 bucket_id = info->bucket_id;
u32 i, count, n_buckets;
- struct bucket *b;
+ struct bpf_local_storage_map_bucket *b;
- smap = (struct bpf_sk_storage_map *)info->map;
+ smap = (struct bpf_local_storage_map *)info->map;
n_buckets = 1U << smap->bucket_log;
if (bucket_id >= n_buckets)
return NULL;
@@ -1256,16 +748,16 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
selem = prev_selem;
count = 0;
while (selem) {
- selem = hlist_entry_safe(selem->map_node.next,
- struct bpf_sk_storage_elem, map_node);
+ selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)),
+ struct bpf_local_storage_elem, map_node);
if (!selem) {
/* not found, unlock and go to the next bucket */
b = &smap->buckets[bucket_id++];
- raw_spin_unlock_bh(&b->lock);
+ rcu_read_unlock();
skip_elems = 0;
break;
}
- sk_storage = rcu_dereference_raw(selem->sk_storage);
+ sk_storage = rcu_dereference(selem->local_storage);
if (sk_storage) {
info->skip_elems = skip_elems + count;
return selem;
@@ -1275,10 +767,10 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
b = &smap->buckets[i];
- raw_spin_lock_bh(&b->lock);
+ rcu_read_lock();
count = 0;
- hlist_for_each_entry(selem, &b->list, map_node) {
- sk_storage = rcu_dereference_raw(selem->sk_storage);
+ hlist_for_each_entry_rcu(selem, &b->list, map_node) {
+ sk_storage = rcu_dereference(selem->local_storage);
if (sk_storage && count >= skip_elems) {
info->bucket_id = i;
info->skip_elems = count;
@@ -1286,7 +778,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
}
count++;
}
- raw_spin_unlock_bh(&b->lock);
+ rcu_read_unlock();
skip_elems = 0;
}
@@ -1297,7 +789,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct bpf_sk_storage_elem *selem;
+ struct bpf_local_storage_elem *selem;
selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL);
if (!selem)
@@ -1330,11 +822,11 @@ DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta,
void *value)
static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
- struct bpf_sk_storage_elem *selem)
+ struct bpf_local_storage_elem *selem)
{
struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
struct bpf_iter__bpf_sk_storage_map ctx = {};
- struct bpf_sk_storage *sk_storage;
+ struct bpf_local_storage *sk_storage;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int ret = 0;
@@ -1345,8 +837,8 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
ctx.meta = &meta;
ctx.map = info->map;
if (selem) {
- sk_storage = rcu_dereference_raw(selem->sk_storage);
- ctx.sk = sk_storage->sk;
+ sk_storage = rcu_dereference(selem->local_storage);
+ ctx.sk = sk_storage->owner;
ctx.value = SDATA(selem)->data;
}
ret = bpf_iter_run_prog(prog, &ctx);
@@ -1361,18 +853,12 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
}
static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
- struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
- struct bpf_sk_storage_map *smap;
- struct bucket *b;
-
- if (!v) {
+ if (!v)
(void)__bpf_sk_storage_map_seq_show(seq, v);
- } else {
- smap = (struct bpf_sk_storage_map *)info->map;
- b = &smap->buckets[info->bucket_id];
- raw_spin_unlock_bh(&b->lock);
- }
+ else
+ rcu_read_unlock();
}
static int bpf_iter_init_sk_storage_map(void *priv_data,
@@ -1437,6 +923,8 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
.target = "bpf_sk_storage_map",
.attach_target = bpf_iter_attach_map,
.detach_target = bpf_iter_detach_map,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
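The conversion to bpf_local_storage changes none of the BPF-visible behaviour of BPF_MAP_TYPE_SK_STORAGE; the get/delete helpers now also accept BTF socket pointers and, via the new *_tracing protos, can be called from fentry/fexit/iter/raw_tp programs that are not themselves attached to bpf_sk_storage functions. A minimal usage sketch from a sockops program, in the style of the existing selftests (map and function names here are illustrative):

// Counts how many sockops callbacks each socket has seen.
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* required for sk storage */
	__type(key, int);
	__type(value, __u64);
} sk_calls SEC(".maps");

SEC("sockops")
int track_sock(struct bpf_sock_ops *ctx)
{
	struct bpf_sock *sk = ctx->sk;
	__u64 *calls;

	if (!sk)
		return 1;

	/* Create the per-socket slot on first use, then bump the counter. */
	calls = bpf_sk_storage_get(&sk_calls, sk, NULL,
				   BPF_SK_STORAGE_GET_F_CREATE);
	if (calls)
		__sync_fetch_and_add(calls, 1);
	return 1;
}

char _license[] SEC("license") = "GPL";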
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 639745d4f3b9..15ab9ffb27fe 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -623,10 +623,11 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
+ struct page *last_head = NULL;
size_t start;
ssize_t copied;
unsigned long truesize;
- int n = 0;
+ int refs, n = 0;
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
@@ -649,13 +650,37 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
} else {
refcount_add(truesize, &skb->sk->sk_wmem_alloc);
}
- while (copied) {
+ for (refs = 0; copied != 0; start = 0) {
int size = min_t(int, copied, PAGE_SIZE - start);
- skb_fill_page_desc(skb, frag++, pages[n], start, size);
- start = 0;
+ struct page *head = compound_head(pages[n]);
+
+ start += (pages[n] - head) << PAGE_SHIFT;
copied -= size;
n++;
+ if (frag) {
+ skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];
+
+ if (head == skb_frag_page(last) &&
+ start == skb_frag_off(last) + skb_frag_size(last)) {
+ skb_frag_size_add(last, size);
+ /* We combined this page; we need to release
+ * a reference. Since a compound page's refcount
+ * is shared among many pages, batch the refcount
+ * adjustments to limit false sharing.
+ */
+ last_head = head;
+ refs++;
+ continue;
+ }
+ }
+ if (refs) {
+ page_ref_sub(last_head, refs);
+ refs = 0;
+ }
+ skb_fill_page_desc(skb, frag++, head, start, size);
}
+ if (refs)
+ page_ref_sub(last_head, refs);
}
return 0;
}
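The reworked loop no longer adds one skb fragment per page returned by the iterator: when a chunk is contiguous with the previous fragment of the same compound page it grows that fragment with skb_frag_size_add() and batches the reference drops with a single page_ref_sub(). The merge test itself is simple; here is a stand-alone sketch of the same bookkeeping, where plain buffers stand in for pages and only the coalescing condition is shown, not the refcounting:

/* Stand-alone sketch: coalescing adjacent extents the way the new
 * __zerocopy_sg_from_iter() extends the previous skb fragment. */
#include <stdio.h>

struct extent {
	const void *base;	/* stands in for the compound head page */
	size_t off;
	size_t len;
};

/* Append (base, off, len); merge with the last extent when contiguous. */
static size_t extent_add(struct extent *tbl, size_t n,
			 const void *base, size_t off, size_t len)
{
	if (n && tbl[n - 1].base == base &&
	    off == tbl[n - 1].off + tbl[n - 1].len) {
		tbl[n - 1].len += len;	/* same idea as skb_frag_size_add() */
		return n;
	}
	tbl[n].base = base;
	tbl[n].off = off;
	tbl[n].len = len;
	return n + 1;
}

int main(void)
{
	static char buf[16384];
	struct extent tbl[8];
	size_t n = 0;
	size_t i;

	/* Four consecutive 4 KiB chunks of one buffer collapse into one extent. */
	for (i = 0; i < 4; i++)
		n = extent_add(tbl, n, buf, i * 4096, 4096);

	printf("%zu extent(s), first covers %zu bytes\n", n, tbl[0].len);
	return 0;
}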
@@ -684,7 +709,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
EXPORT_SYMBOL(zerocopy_sg_from_iter);
/**
- * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator
+ * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
* and update a checksum.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
@@ -696,8 +721,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len,
__wsum *csump)
{
- return __skb_datagram_iter(skb, offset, to, len, true,
- csum_and_copy_to_iter, csump);
+ struct csum_state csdata = { .csum = *csump };
+ int ret;
+
+ ret = __skb_datagram_iter(skb, offset, to, len, true,
+ csum_and_copy_to_iter, &csdata);
+ if (ret)
+ return ret;
+
+ *csump = csdata.csum;
+ return 0;
}
/**
diff --git a/net/core/dev.c b/net/core/dev.c
index 4906b44af850..449b45b843d4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -98,6 +98,7 @@
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
+#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
@@ -144,6 +145,7 @@
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
#include <linux/pm_runtime.h>
+#include <linux/prandom.h>
#include "net-sysfs.h"
@@ -1067,19 +1069,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
-struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
-{
- struct net_device *dev;
-
- ASSERT_RTNL();
- for_each_netdev(net, dev)
- if (dev->type == type)
- return dev;
-
- return NULL;
-}
-EXPORT_SYMBOL(__dev_getfirstbyhwtype);
-
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
@@ -1130,7 +1119,7 @@ EXPORT_SYMBOL(__dev_get_by_flags);
* @name: name string
*
* Network device names need to be valid file names to
- * to allow sysfs to work. We also disallow any kind of
+ * allow sysfs to work. We also disallow any kind of
* whitespace.
*/
bool dev_valid_name(const char *name)
@@ -1468,6 +1457,25 @@ void netdev_state_change(struct net_device *dev)
EXPORT_SYMBOL(netdev_state_change);
/**
+ * __netdev_notify_peers - notify network peers about existence of @dev,
+ * to be called when rtnl lock is already held.
+ * @dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void __netdev_notify_peers(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
+}
+EXPORT_SYMBOL(__netdev_notify_peers);
+
+/**
* netdev_notify_peers - notify network peers about existence of @dev
* @dev: network device
*
@@ -1480,8 +1488,7 @@ EXPORT_SYMBOL(netdev_state_change);
void netdev_notify_peers(struct net_device *dev)
{
rtnl_lock();
- call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
- call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
+ __netdev_notify_peers(dev);
rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
@@ -3204,7 +3211,7 @@ int skb_checksum_help(struct sk_buff *skb)
if (skb->ip_summed == CHECKSUM_COMPLETE)
goto out_set_summed;
- if (unlikely(skb_shinfo(skb)->gso_size)) {
+ if (unlikely(skb_is_gso(skb))) {
skb_warn_bad_offload(skb);
return -EINVAL;
}
@@ -3493,6 +3500,11 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (gso_segs > dev->gso_max_segs)
return features & ~NETIF_F_GSO_MASK;
+ if (!skb_shinfo(skb)->gso_type) {
+ skb_warn_bad_offload(skb);
+ return features & ~NETIF_F_GSO_MASK;
+ }
+
/* Support for GSO partial features requires software
* intervention before we can actually process the packets
* so we need to strip support for any partial features now
@@ -3557,6 +3569,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
dev_queue_xmit_nit(skb, dev);
len = skb->len;
+ PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
trace_net_dev_start_xmit(skb, dev);
rc = netdev_start_xmit(skb, dev, txq, more);
trace_net_dev_xmit(skb, rc, dev, len);
@@ -3864,6 +3877,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
return skb;
/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
+ qdisc_skb_cb(skb)->mru = 0;
mini_qdisc_bstats_cpu_update(miniq, skb);
switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
@@ -4129,6 +4143,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
if (!skb)
goto out;
+ PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
@@ -4176,7 +4191,7 @@ int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
-int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
struct net_device *dev = skb->dev;
struct sk_buff *orig_skb = skb;
@@ -4194,6 +4209,7 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
skb_set_queue_mapping(skb, queue_id);
txq = skb_get_tx_queue(dev, skb);
+ PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
local_bh_disable();
@@ -4205,17 +4221,13 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
dev_xmit_recursion_dec();
local_bh_enable();
-
- if (!dev_xmit_complete(ret))
- kfree_skb(skb);
-
return ret;
drop:
atomic_long_inc(&dev->tx_dropped);
kfree_skb_list(skb);
return NET_XMIT_DROP;
}
-EXPORT_SYMBOL(dev_direct_xmit);
+EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
@@ -4840,6 +4852,21 @@ int netif_rx_ni(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_rx_ni);
+int netif_rx_any_context(struct sk_buff *skb)
+{
+ /*
+ * If invoked from contexts which do not invoke bottom half
+ * processing either at return from interrupt or when softirqs are
+ * reenabled, use netif_rx_ni() which invokes bottom half processing
+ * directly.
+ */
+ if (in_interrupt())
+ return netif_rx(skb);
+ else
+ return netif_rx_ni(skb);
+}
+EXPORT_SYMBOL(netif_rx_any_context);
+
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -4914,7 +4941,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
- struct net_device *orig_dev)
+ struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
@@ -4934,6 +4961,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
}
qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_skb_cb(skb)->mru = 0;
skb->tc_at_ingress = 1;
mini_qdisc_bstats_cpu_update(miniq, skb);
@@ -4958,7 +4986,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
- skb_do_redirect(skb);
+ if (skb_do_redirect(skb) == -EAGAIN) {
+ __skb_pull(skb, skb->mac_len);
+ *another = true;
+ break;
+ }
return NULL;
case TC_ACT_CONSUMED:
return NULL;
@@ -5147,7 +5179,12 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
- skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+ bool another = false;
+
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
+ &another);
+ if (another)
+ goto another_round;
if (!skb)
goto out;
@@ -5192,7 +5229,7 @@ skip_classify:
}
}
- if (unlikely(skb_vlan_tag_present(skb))) {
+ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
if (skb_vlan_tag_get_id(skb)) {
/* Vlan id is non 0 and vlan_do_receive() above couldn't
@@ -5441,15 +5478,20 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
if (new) {
u32 i;
+ mutex_lock(&new->aux->used_maps_mutex);
+
/* generic XDP does not work with DEVMAPs that can
* have a bpf_prog installed on an entry
*/
for (i = 0; i < new->aux->used_map_cnt; i++) {
- if (dev_map_can_have_prog(new->aux->used_maps[i]))
- return -EINVAL;
- if (cpu_map_prog_allowed(new->aux->used_maps[i]))
+ if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
+ cpu_map_prog_allowed(new->aux->used_maps[i])) {
+ mutex_unlock(&new->aux->used_maps_mutex);
return -EINVAL;
+ }
}
+
+ mutex_unlock(&new->aux->used_maps_mutex);
}
switch (xdp->command) {
@@ -5621,17 +5663,60 @@ static void flush_backlog(struct work_struct *work)
local_bh_enable();
}
+static bool flush_required(int cpu)
+{
+#if IS_ENABLED(CONFIG_RPS)
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+ bool do_flush;
+
+ local_irq_disable();
+ rps_lock(sd);
+
+ /* as insertion into process_queue happens with the rps lock held,
+ * process_queue access may race only with dequeue
+ */
+ do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
+ !skb_queue_empty_lockless(&sd->process_queue);
+ rps_unlock(sd);
+ local_irq_enable();
+
+ return do_flush;
+#endif
+ /* without RPS we can't safely check input_pkt_queue: during a
+ * concurrent remote skb_queue_splice() we can detect as empty both
+ * input_pkt_queue and process_queue even if the latter could end up
+ * containing a lot of packets.
+ */
+ return true;
+}
+
static void flush_all_backlogs(void)
{
+ static cpumask_t flush_cpus;
unsigned int cpu;
+ /* since we are under rtnl lock protection we can use static data
+ * for the cpumask and avoid allocating the possibly large mask
+ * on the stack
+ */
+ ASSERT_RTNL();
+
get_online_cpus();
- for_each_online_cpu(cpu)
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
+ cpumask_clear(&flush_cpus);
+ for_each_online_cpu(cpu) {
+ if (flush_required(cpu)) {
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&flush_works, cpu));
+ cpumask_set_cpu(cpu, &flush_cpus);
+ }
+ }
- for_each_online_cpu(cpu)
+ /* we can have in flight packet[s] on the cpus we are not flushing;
+ * synchronize_net() in rollback_registered_many() will take care of
+ * them
+ */
+ for_each_cpu(cpu, &flush_cpus)
flush_work(per_cpu_ptr(&flush_works, cpu));
put_online_cpus();
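Editor's note: flush_all_backlogs() now probes each online CPU's backlog first, schedules the flush work only where something is queued, and later waits only on the work it actually queued; the static cpumask is safe because the caller holds rtnl. A userspace sketch of the same dispatch-then-wait-on-selected pattern, with a plain bitmask standing in for the cpumask (all names here are illustrative).

#include <stdio.h>
#include <stdbool.h>

#define NQUEUES 8

static int backlog[NQUEUES] = { 0, 3, 0, 0, 7, 0, 1, 0 };

/* Analogue of flush_required(): is anything queued for this cpu? */
static bool flush_required(int cpu)
{
    return backlog[cpu] != 0;
}

static void flush_one(int cpu)
{
    backlog[cpu] = 0;   /* stand-in for queueing the per-cpu flush work */
}

int main(void)
{
    unsigned long flush_mask = 0;

    /* Pass 1: dispatch work only where needed, remembering which ones. */
    for (int cpu = 0; cpu < NQUEUES; cpu++) {
        if (flush_required(cpu)) {
            flush_one(cpu);
            flush_mask |= 1UL << cpu;
        }
    }

    /* Pass 2: wait only for the work that was actually dispatched. */
    for (int cpu = 0; cpu < NQUEUES; cpu++)
        if (flush_mask & (1UL << cpu))
            printf("waited for flush on cpu %d\n", cpu);

    return 0;
}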
@@ -5650,10 +5735,11 @@ static void gro_normal_list(struct napi_struct *napi)
/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
* pass the whole batch up to the stack.
*/
-static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
+static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
{
list_add_tail(&skb->list, &napi->rx_list);
- if (++napi->rx_count >= gro_normal_batch)
+ napi->rx_count += segs;
+ if (napi->rx_count >= gro_normal_batch)
gro_normal_list(napi);
}
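Editor's note: gro_normal_one() now accounts for received segments rather than skbs, so a single coalesced GRO super-packet carrying many segments fills the batch as quickly as the same number of individual packets would. A minimal sketch of segment-based batching; the names mirror the kernel's but the code is a userspace illustration only.

#include <stdio.h>

static int rx_count;
static const int gro_normal_batch = 8;

static void gro_normal_list(void)
{
    printf("flushing batch of %d segments\n", rx_count);
    rx_count = 0;
}

/* Queue one packet carrying 'segs' segments; flush once the batch
 * threshold is reached. */
static void gro_normal_one(int segs)
{
    rx_count += segs;
    if (rx_count >= gro_normal_batch)
        gro_normal_list();
}

int main(void)
{
    gro_normal_one(1);   /* plain packet */
    gro_normal_one(6);   /* GRO super-packet made of 6 segments */
    gro_normal_one(1);   /* pushes the batch over the threshold */
    return 0;
}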
@@ -5692,7 +5778,7 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
}
out:
- gro_normal_one(napi, skb);
+ gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
return NET_RX_SUCCESS;
}
@@ -5982,7 +6068,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
{
switch (ret) {
case GRO_NORMAL:
- gro_normal_one(napi, skb);
+ gro_normal_one(napi, skb, 1);
break;
case GRO_DROP:
@@ -6070,7 +6156,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_NORMAL)
- gro_normal_one(napi, skb);
+ gro_normal_one(napi, skb, 1);
break;
case GRO_DROP:
@@ -6293,7 +6379,7 @@ EXPORT_SYMBOL(__napi_schedule);
* @n: napi context
*
* Test if NAPI routine is already running, and if not mark
- * it as running. This is used as a condition variable
+ * it as running. This is used as a condition variable to
* insure only one NAPI poll instance runs. We also make
* sure there is no pending NAPI disable.
*/
@@ -6381,7 +6467,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
- new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
@@ -6418,10 +6505,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
#if defined(CONFIG_NET_RX_BUSY_POLL)
-#define BUSY_POLL_BUDGET 8
+static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
+{
+ if (!skip_schedule) {
+ gro_normal_list(napi);
+ __napi_schedule(napi);
+ return;
+ }
+
+ if (napi->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(napi, HZ >= 1000);
+ }
+
+ gro_normal_list(napi);
+ clear_bit(NAPI_STATE_SCHED, &napi->state);
+}
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
+ u16 budget)
{
+ bool skip_schedule = false;
+ unsigned long timeout;
int rc;
/* Busy polling means there is a high chance device driver hard irq
@@ -6438,29 +6545,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
local_bh_disable();
+ if (prefer_busy_poll) {
+ napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
+ timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ if (napi->defer_hard_irqs_count && timeout) {
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
+ skip_schedule = true;
+ }
+ }
+
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ rc = napi->poll(napi, budget);
/* We can't gro_normal_list() here, because napi->poll() might have
* rearmed the napi (napi_complete_done()) in which case it could
* already be running on another CPU.
*/
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock);
- if (rc == BUSY_POLL_BUDGET) {
- /* As the whole budget was spent, we still own the napi so can
- * safely handle the rx_list.
- */
- gro_normal_list(napi);
- __napi_schedule(napi);
- }
+ if (rc == budget)
+ __busy_poll_stop(napi, skip_schedule);
local_bh_enable();
}
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
- void *loop_end_arg)
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
@@ -6488,17 +6599,23 @@ restart:
* we avoid dirtying napi->state as much as we can.
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
- NAPIF_STATE_IN_BUSY_POLL))
+ NAPIF_STATE_IN_BUSY_POLL)) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
- NAPIF_STATE_SCHED) != val)
+ NAPIF_STATE_SCHED) != val) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
}
- work = napi_poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
+ work = napi_poll(napi, budget);
+ trace_napi_poll(napi, work, budget);
gro_normal_list(napi);
count:
if (work > 0)
@@ -6511,7 +6628,7 @@ count:
if (unlikely(need_resched())) {
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
rcu_read_unlock();
cond_resched();
@@ -6522,7 +6639,7 @@ count:
cpu_relax();
}
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
out:
rcu_read_unlock();
@@ -6533,8 +6650,7 @@ EXPORT_SYMBOL(napi_busy_loop);
static void napi_hash_add(struct napi_struct *napi)
{
- if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
- test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
spin_lock(&napi_hash_lock);
@@ -6555,20 +6671,14 @@ static void napi_hash_add(struct napi_struct *napi)
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
-bool napi_hash_del(struct napi_struct *napi)
+static void napi_hash_del(struct napi_struct *napi)
{
- bool rcu_sync_needed = false;
-
spin_lock(&napi_hash_lock);
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
- rcu_sync_needed = true;
- hlist_del_rcu(&napi->napi_hash_node);
- }
+ hlist_del_init_rcu(&napi->napi_hash_node);
+
spin_unlock(&napi_hash_lock);
- return rcu_sync_needed;
}
-EXPORT_SYMBOL_GPL(napi_hash_del);
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
@@ -6580,8 +6690,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
if (!napi_disable_pending(napi) &&
- !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi);
+ }
return HRTIMER_NORESTART;
}
@@ -6600,7 +6712,11 @@ static void init_gro_hash(struct napi_struct *napi)
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
+ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
+ return;
+
INIT_LIST_HEAD(&napi->poll_list);
+ INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
init_gro_hash(napi);
@@ -6635,6 +6751,7 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6653,18 +6770,19 @@ static void flush_gro_hash(struct napi_struct *napi)
}
/* Must be called in process context */
-void netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del(struct napi_struct *napi)
{
- might_sleep();
- if (napi_hash_del(napi))
- synchronize_net();
- list_del_init(&napi->dev_list);
+ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
+ return;
+
+ napi_hash_del(napi);
+ list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
flush_gro_hash(napi);
napi->gro_bitmask = 0;
}
-EXPORT_SYMBOL(netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
@@ -6706,6 +6824,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
+ /* The NAPI context has more processing work, but busy-polling
+ * is preferred. Exit early.
+ */
+ if (napi_prefer_busy_poll(n)) {
+ if (napi_complete_done(n, work)) {
+ /* If timeout is not set, we need to make sure
+ * that the NAPI is re-scheduled.
+ */
+ napi_schedule(n);
+ }
+ goto out_unlock;
+ }
+
if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
@@ -6844,7 +6975,7 @@ bool netdev_has_upper_dev(struct net_device *dev,
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
- * netdev_has_upper_dev_all - Check if device is linked to an upper device
+ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
* @dev: device
* @upper_dev: upper device to check
*
@@ -8082,7 +8213,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private);
/**
- * netdev_lower_change - Dispatch event about lower device state change
+ * netdev_lower_state_changed - Dispatch event about lower device state change
* @lower_dev: device
* @lower_state_info: state to dispatch
*
@@ -8827,7 +8958,7 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
return dev->netdev_ops->ndo_bpf;
default:
return NULL;
- };
+ }
}
static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
@@ -8846,6 +8977,17 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
return dev->xdp_state[mode].prog;
}
+static u8 dev_xdp_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
+ count++;
+ return count;
+}
+
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
struct bpf_prog *prog = dev_xdp_prog(dev, mode);
@@ -8936,6 +9078,7 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
struct bpf_xdp_link *link, struct bpf_prog *new_prog,
struct bpf_prog *old_prog, u32 flags)
{
+ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
struct bpf_prog *cur_prog;
enum bpf_xdp_mode mode;
bpf_op_t bpf_op;
@@ -8951,11 +9094,17 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
return -EINVAL;
}
- /* just one XDP mode bit should be set, zero defaults to SKB mode */
- if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
+ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
+ if (num_modes > 1) {
NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
return -EINVAL;
}
+ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
+ if (!num_modes && dev_xdp_prog_count(dev) > 1) {
+ NL_SET_ERR_MSG(extack,
+ "More than one program loaded, unset mode is ambiguous");
+ return -EINVAL;
+ }
/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
@@ -9513,6 +9662,22 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ if (features & NETIF_F_HW_TLS_TX) {
+ bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
+ (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+ bool hw_csum = features & NETIF_F_HW_CSUM;
+
+ if (!ip_csum && !hw_csum) {
+ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_TX;
+ }
+ }
+
+ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
+ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_RX;
+ }
+
return features;
}
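Editor's note: netdev_fix_features() now drops NETIF_F_HW_TLS_TX unless the device also offers a checksum offload (both IP/IPv6 checksums, or generic HW_CSUM), and drops NETIF_F_HW_TLS_RX without RXCSUM. The dependency check is plain bitmask arithmetic; a hedged sketch with made-up flag values follows.

#include <stdio.h>

/* Illustrative feature bits; the values are arbitrary for the example. */
#define F_IP_CSUM    (1u << 0)
#define F_IPV6_CSUM  (1u << 1)
#define F_HW_CSUM    (1u << 2)
#define F_RXCSUM     (1u << 3)
#define F_HW_TLS_TX  (1u << 4)
#define F_HW_TLS_RX  (1u << 5)

static unsigned int fix_features(unsigned int features)
{
    int ip_csum = (features & (F_IP_CSUM | F_IPV6_CSUM)) ==
                  (F_IP_CSUM | F_IPV6_CSUM);
    int hw_csum = features & F_HW_CSUM;

    /* TLS TX offload is dropped if the device cannot checksum. */
    if ((features & F_HW_TLS_TX) && !ip_csum && !hw_csum)
        features &= ~F_HW_TLS_TX;

    /* Likewise TLS RX offload depends on RX checksumming. */
    if ((features & F_HW_TLS_RX) && !(features & F_RXCSUM))
        features &= ~F_HW_TLS_RX;

    return features;
}

int main(void)
{
    unsigned int f = F_HW_TLS_TX | F_HW_TLS_RX | F_RXCSUM;

    printf("before=%#x after=%#x\n", f, fix_features(f));
    return 0;
}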
@@ -9533,7 +9698,7 @@ int __netdev_update_features(struct net_device *dev)
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
- /* some features can't be enabled if they're off an an upper device */
+ /* some features can't be enabled if they're off on an upper device */
netdev_for_each_upper_dev_rcu(dev, upper, iter)
features = netdev_sync_upper_features(dev, upper, features);
@@ -9688,7 +9853,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
rx[i].dev = dev;
/* XDP RX-queue setup */
- err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
+ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
if (err < 0)
goto err_rxq_info;
}
@@ -9924,17 +10089,11 @@ int register_netdevice(struct net_device *dev)
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
if (ret) {
+ /* Expect explicit free_netdev() on failure */
+ dev->needs_free_netdev = false;
rollback_registered(dev);
- rcu_barrier();
-
- dev->reg_state = NETREG_UNREGISTERED;
- /* We should put the kobject that hold in
- * netdev_unregister_kobject(), otherwise
- * the net device cannot be freed when
- * driver calls free_netdev(), because the
- * kobject is being hold.
- */
- kobject_put(&dev->dev.kobj);
+ net_set_todo(dev);
+ goto out;
}
/*
* Prevent userspace races by waiting until the network
@@ -10037,6 +10196,8 @@ int netdev_refcnt_read(const struct net_device *dev)
}
EXPORT_SYMBOL(netdev_refcnt_read);
+#define WAIT_REFS_MIN_MSECS 1
+#define WAIT_REFS_MAX_MSECS 250
/**
* netdev_wait_allrefs - wait until all references are gone.
* @dev: target net_device
@@ -10052,7 +10213,7 @@ EXPORT_SYMBOL(netdev_refcnt_read);
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
- int refcnt;
+ int wait = 0, refcnt;
linkwatch_forget_dev(dev);
@@ -10086,7 +10247,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
rebroadcast_time = jiffies;
}
- msleep(250);
+ if (!wait) {
+ rcu_barrier();
+ wait = WAIT_REFS_MIN_MSECS;
+ } else {
+ msleep(wait);
+ wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
+ }
refcnt = netdev_refcnt_read(dev);
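Editor's note: instead of sleeping a fixed 250 ms between refcount polls, netdev_wait_allrefs() now issues an rcu_barrier() on the first pass and then backs off exponentially from 1 ms up to the 250 ms cap, so devices whose references drop quickly are released much sooner. A standalone sketch of just the backoff schedule:

#include <stdio.h>

#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250

int main(void)
{
    int wait = 0;

    for (int i = 0; i < 12; i++) {
        if (!wait) {
            /* First iteration: no sleep (the kernel runs rcu_barrier()
             * here instead), then start the exponential backoff. */
            wait = WAIT_REFS_MIN_MSECS;
            printf("iteration %d: rcu_barrier, no sleep\n", i);
            continue;
        }
        printf("iteration %d: sleep %d ms\n", i, wait);
        wait = wait * 2 < WAIT_REFS_MAX_MSECS ? wait * 2
                                              : WAIT_REFS_MAX_MSECS;
    }
    return 0;
}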
@@ -10134,7 +10301,7 @@ void netdev_run_todo(void)
struct net_device *dev = list_first_entry(&unlink_list,
struct net_device,
unlink_list);
- list_del(&dev->unlink_list);
+ list_del_init(&dev->unlink_list);
dev->nested_level = dev->lower_level - 1;
}
#endif
@@ -10249,6 +10416,55 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
}
EXPORT_SYMBOL(dev_get_stats);
+/**
+ * dev_fetch_sw_netstats - get per-cpu network device statistics
+ * @s: place to store stats
+ * @netstats: per-cpu network stats to read from
+ *
+ * Read per-cpu network statistics and populate the related fields in @s.
+ */
+void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_sw_netstats __percpu *netstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ const struct pcpu_sw_netstats *stats;
+ struct pcpu_sw_netstats tmp;
+ unsigned int start;
+
+ stats = per_cpu_ptr(netstats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ tmp.rx_packets = stats->rx_packets;
+ tmp.rx_bytes = stats->rx_bytes;
+ tmp.tx_packets = stats->tx_packets;
+ tmp.tx_bytes = stats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ s->rx_packets += tmp.rx_packets;
+ s->rx_bytes += tmp.rx_bytes;
+ s->tx_packets += tmp.tx_packets;
+ s->tx_bytes += tmp.tx_bytes;
+ }
+}
+EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
+
+/**
+ * dev_get_tstats64 - ndo_get_stats64 implementation
+ * @dev: device to get statistics from
+ * @s: place to store stats
+ *
+ * Populate @s from dev->stats and dev->tstats. Can be used as
+ * ndo_get_stats64() callback.
+ */
+void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
+{
+ netdev_stats_to_stats64(s, &dev->stats);
+ dev_fetch_sw_netstats(s, dev->tstats);
+}
+EXPORT_SYMBOL_GPL(dev_get_tstats64);
+
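Editor's note: dev_fetch_sw_netstats() folds the per-cpu pcpu_sw_netstats counters into a single rtnl_link_stats64 under the u64_stats seqcount, and dev_get_tstats64() combines that with netdev_stats_to_stats64() so drivers keeping stats in dev->tstats can use it directly as their ndo_get_stats64 callback. Below is a userspace sketch of the aggregation step only, without the seqcount retry loop (which matters mostly on 32-bit); the types are illustrative.

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 4

/* Illustrative per-cpu counters, one slot per cpu. */
struct sw_netstats { uint64_t rx_packets, rx_bytes, tx_packets, tx_bytes; };
struct link_stats  { uint64_t rx_packets, rx_bytes, tx_packets, tx_bytes; };

static void fetch_sw_netstats(struct link_stats *s,
                              const struct sw_netstats *percpu)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        /* The kernel snapshots each cpu's counters inside a
         * u64_stats_fetch_begin/retry loop; a plain read suffices here. */
        s->rx_packets += percpu[cpu].rx_packets;
        s->rx_bytes   += percpu[cpu].rx_bytes;
        s->tx_packets += percpu[cpu].tx_packets;
        s->tx_bytes   += percpu[cpu].tx_bytes;
    }
}

int main(void)
{
    struct sw_netstats percpu[NR_CPUS] = {
        { 10, 1000, 5, 500 }, { 3, 300, 1, 100 }, { 0 }, { 7, 700, 2, 200 },
    };
    struct link_stats total = { 0 };

    fetch_sw_netstats(&total, percpu);
    printf("rx %llu pkts / %llu bytes, tx %llu pkts / %llu bytes\n",
           (unsigned long long)total.rx_packets,
           (unsigned long long)total.rx_bytes,
           (unsigned long long)total.tx_packets,
           (unsigned long long)total.tx_bytes);
    return 0;
}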
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
struct netdev_queue *queue = dev_ingress_queue(dev);
@@ -10421,6 +10637,17 @@ void free_netdev(struct net_device *dev)
struct napi_struct *p, *n;
might_sleep();
+
+ /* When called immediately after register_netdevice() failed, the unwind
+ * handling may still be dismantling the device. Handle that case by
+ * deferring the free.
+ */
+ if (dev->reg_state == NETREG_UNREGISTERING) {
+ ASSERT_RTNL();
+ dev->needs_free_netdev = true;
+ return;
+ }
+
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -11048,8 +11275,7 @@ static int __init net_dev_init(void)
INIT_LIST_HEAD(&sd->poll_list);
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
- sd->csd.func = rps_trigger_softirq;
- sd->csd.info = sd;
+ INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
sd->cpu = i;
#endif
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 205e92e604ef..db8a0ff86f36 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -230,7 +230,7 @@ static int dev_do_ioctl(struct net_device *dev,
struct ifreq *ifr, unsigned int cmd)
{
const struct net_device_ops *ops = dev->netdev_ops;
- int err = -EOPNOTSUPP;
+ int err;
err = dsa_ndo_do_ioctl(dev, ifr, cmd);
if (err == 0 || err != -EOPNOTSUPP)
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 80ec1cd81c64..738d4344d679 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -27,7 +27,6 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/devlink.h>
-#include <net/drop_monitor.h>
#define CREATE_TRACE_POINTS
#include <trace/events/devlink.h>
@@ -84,6 +83,7 @@ EXPORT_SYMBOL(devlink_dpipe_header_ipv6);
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
@@ -347,8 +347,12 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
struct devlink_region {
struct devlink *devlink;
+ struct devlink_port *port;
struct list_head list;
- const struct devlink_region_ops *ops;
+ union {
+ const struct devlink_region_ops *ops;
+ const struct devlink_port_region_ops *port_ops;
+ };
struct list_head snapshot_list;
u32 max_snapshots;
u32 cur_snapshots;
@@ -374,6 +378,19 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
return NULL;
}
+static struct devlink_region *
+devlink_port_region_get_by_name(struct devlink_port *port,
+ const char *region_name)
+{
+ struct devlink_region *region;
+
+ list_for_each_entry(region, &port->region_list, list)
+ if (!strcmp(region->ops->name, region_name))
+ return region;
+
+ return NULL;
+}
+
static struct devlink_snapshot *
devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
{
@@ -462,10 +479,132 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
return 0;
}
+struct devlink_reload_combination {
+ enum devlink_reload_action action;
+ enum devlink_reload_limit limit;
+};
+
+static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = {
+ {
+ /* can't reinitialize driver with no down time */
+ .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ .limit = DEVLINK_RELOAD_LIMIT_NO_RESET,
+ },
+};
+
+static bool
+devlink_reload_combination_is_invalid(enum devlink_reload_action action,
+ enum devlink_reload_limit limit)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++)
+ if (devlink_reload_invalid_combinations[i].action == action &&
+ devlink_reload_invalid_combinations[i].limit == limit)
+ return true;
+ return false;
+}
+
+static bool
+devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action)
+{
+ return test_bit(action, &devlink->ops->reload_actions);
+}
+
+static bool
+devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit)
+{
+ return test_bit(limit, &devlink->ops->reload_limits);
+}
+
+static int devlink_reload_stat_put(struct sk_buff *msg,
+ enum devlink_reload_limit limit, u32 value)
+{
+ struct nlattr *reload_stats_entry;
+
+ reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY);
+ if (!reload_stats_entry)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) ||
+ nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value))
+ goto nla_put_failure;
+ nla_nest_end(msg, reload_stats_entry);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_entry);
+ return -EMSGSIZE;
+}
+
+static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote)
+{
+ struct nlattr *reload_stats_attr, *act_info, *act_stats;
+ int i, j, stat_idx;
+ u32 value;
+
+ if (!is_remote)
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS);
+ else
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS);
+
+ if (!reload_stats_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) {
+ if ((!is_remote &&
+ !devlink_reload_action_is_supported(devlink, i)) ||
+ i == DEVLINK_RELOAD_ACTION_UNSPEC)
+ continue;
+ act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO);
+ if (!act_info)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i))
+ goto action_info_nest_cancel;
+ act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS);
+ if (!act_stats)
+ goto action_info_nest_cancel;
+
+ for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) {
+ /* Remote stats are shown even if not locally supported.
+ * Stats of actions with unspecified limit are shown
+ * even though drivers don't need to register the
+ * unspecified limit.
+ */
+ if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC &&
+ !devlink_reload_limit_is_supported(devlink, j)) ||
+ devlink_reload_combination_is_invalid(i, j))
+ continue;
+
+ stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i;
+ if (!is_remote)
+ value = devlink->stats.reload_stats[stat_idx];
+ else
+ value = devlink->stats.remote_reload_stats[stat_idx];
+ if (devlink_reload_stat_put(msg, j, value))
+ goto action_stats_nest_cancel;
+ }
+ nla_nest_end(msg, act_stats);
+ nla_nest_end(msg, act_info);
+ }
+ nla_nest_end(msg, reload_stats_attr);
+ return 0;
+
+action_stats_nest_cancel:
+ nla_nest_cancel(msg, act_stats);
+action_info_nest_cancel:
+ nla_nest_cancel(msg, act_info);
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_attr);
+ return -EMSGSIZE;
+}
+
static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
enum devlink_command cmd, u32 portid,
u32 seq, int flags)
{
+ struct nlattr *dev_stats;
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
@@ -477,9 +616,21 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed))
goto nla_put_failure;
+ dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS);
+ if (!dev_stats)
+ goto nla_put_failure;
+
+ if (devlink_reload_stats_put(msg, devlink, false))
+ goto dev_stats_nest_cancel;
+ if (devlink_reload_stats_put(msg, devlink, true))
+ goto dev_stats_nest_cancel;
+
+ nla_nest_end(msg, dev_stats);
genlmsg_end(msg, hdr);
return 0;
+dev_stats_nest_cancel:
+ nla_nest_cancel(msg, dev_stats);
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
@@ -523,15 +674,20 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
return -EMSGSIZE;
switch (devlink_port->attrs.flavour) {
case DEVLINK_PORT_FLAVOUR_PCI_PF:
- if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
- attrs->pci_pf.pf))
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_pf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external))
return -EMSGSIZE;
break;
case DEVLINK_PORT_FLAVOUR_PCI_VF:
- if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
- attrs->pci_vf.pf) ||
- nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER,
- attrs->pci_vf.vf))
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_vf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external))
return -EMSGSIZE;
break;
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
@@ -616,6 +772,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
goto nla_put_failure;
+ /* Hold rtnl lock while accessing port's netdev attributes. */
+ rtnl_lock();
spin_lock_bh(&devlink_port->type_lock);
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
goto nla_put_failure_type_locked;
@@ -624,9 +782,10 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
devlink_port->desired_type))
goto nla_put_failure_type_locked;
if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+ struct net *net = devlink_net(devlink_port->devlink);
struct net_device *netdev = devlink_port->type_dev;
- if (netdev &&
+ if (netdev && net_eq(net, dev_net(netdev)) &&
(nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
netdev->ifindex) ||
nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
@@ -642,6 +801,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
goto nla_put_failure_type_locked;
}
spin_unlock_bh(&devlink_port->type_lock);
+ rtnl_unlock();
if (devlink_nl_port_attrs_put(msg, devlink_port))
goto nla_put_failure;
if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack))
@@ -652,6 +812,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
nla_put_failure_type_locked:
spin_unlock_bh(&devlink_port->type_lock);
+ rtnl_unlock();
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
@@ -806,8 +967,6 @@ static int devlink_port_type_set(struct devlink *devlink,
int err;
if (devlink->ops->port_type_set) {
- if (port_type == DEVLINK_PORT_TYPE_NOTSET)
- return -EINVAL;
if (port_type == devlink_port->type)
return 0;
err = devlink->ops->port_type_set(devlink_port, port_type);
@@ -1311,7 +1470,7 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
pool_index, &cur, &max);
if (err && err != -EOPNOTSUPP)
- return err;
+ goto sb_occ_get_failure;
if (!err) {
if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
goto nla_put_failure;
@@ -1324,8 +1483,10 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
return 0;
nla_put_failure:
+ err = -EMSGSIZE;
+sb_occ_get_failure:
genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
+ return err;
}
static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb,
@@ -2943,9 +3104,9 @@ static void devlink_reload_netns_change(struct devlink *devlink,
DEVLINK_CMD_PARAM_NEW);
}
-static bool devlink_reload_supported(const struct devlink *devlink)
+static bool devlink_reload_supported(const struct devlink_ops *ops)
{
- return devlink->ops->reload_down && devlink->ops->reload_up;
+ return ops->reload_down && ops->reload_up;
}
static void devlink_reload_failed_set(struct devlink *devlink,
@@ -2963,33 +3124,132 @@ bool devlink_is_reload_failed(const struct devlink *devlink)
}
EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
+static void
+__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats,
+ enum devlink_reload_limit limit, u32 actions_performed)
+{
+ unsigned long actions = actions_performed;
+ int stat_idx;
+ int action;
+
+ for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) {
+ stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action;
+ reload_stats[stat_idx]++;
+ }
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static void
+devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit,
+ u32 actions_performed)
+{
+ __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit,
+ actions_performed);
+}
+
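Editor's note: __devlink_reload_stats_update() treats the stats array as a flat [limit][action] matrix, walking the set bits of actions_performed and bumping reload_stats[limit * __DEVLINK_RELOAD_ACTION_MAX + action]. A standalone sketch of that indexing; the dimensions below are illustrative, not the kernel's enum sizes.

#include <stdio.h>

#define MAX_ACTIONS 3   /* illustrative */
#define MAX_LIMITS  2   /* illustrative */

static unsigned int reload_stats[MAX_LIMITS * MAX_ACTIONS];

/* Bump one counter per action bit, in the row selected by 'limit'. */
static void stats_update(unsigned int limit, unsigned int actions_performed)
{
    for (unsigned int action = 0; action < MAX_ACTIONS; action++) {
        if (!(actions_performed & (1u << action)))
            continue;
        reload_stats[limit * MAX_ACTIONS + action]++;
    }
}

int main(void)
{
    stats_update(0, 0x3);   /* actions 0 and 1 under limit 0 */
    stats_update(1, 0x2);   /* action 1 under limit 1 */

    for (unsigned int l = 0; l < MAX_LIMITS; l++)
        for (unsigned int a = 0; a < MAX_ACTIONS; a++)
            printf("limit %u action %u -> %u\n",
                   l, a, reload_stats[l * MAX_ACTIONS + a]);
    return 0;
}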
+/**
+ * devlink_remote_reload_actions_performed - Update devlink on reload actions
+ * performed which are not a direct result of devlink reload call.
+ *
+ * This should be called by a driver after performing reload actions that were not
+ * the result of a devlink reload call. For example, fw_activate may have been performed
+ * because a devlink reload on another host triggered fw_activate on this one.
+ * The motivation for this function is to keep data on reload actions performed on this
+ * function, whether or not they were done due to a direct devlink reload call.
+ *
+ * @devlink: devlink
+ * @limit: reload limit
+ * @actions_performed: bitmask of actions performed
+ */
+void devlink_remote_reload_actions_performed(struct devlink *devlink,
+ enum devlink_reload_limit limit,
+ u32 actions_performed)
+{
+ if (WARN_ON(!actions_performed ||
+ actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
+ actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) ||
+ limit > DEVLINK_RELOAD_LIMIT_MAX))
+ return;
+
+ __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit,
+ actions_performed);
+}
+EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed);
+
static int devlink_reload(struct devlink *devlink, struct net *dest_net,
- struct netlink_ext_ack *extack)
+ enum devlink_reload_action action, enum devlink_reload_limit limit,
+ u32 *actions_performed, struct netlink_ext_ack *extack)
{
+ u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
int err;
if (!devlink->reload_enabled)
return -EOPNOTSUPP;
- err = devlink->ops->reload_down(devlink, !!dest_net, extack);
+ memcpy(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats));
+ err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack);
if (err)
return err;
if (dest_net && !net_eq(dest_net, devlink_net(devlink)))
devlink_reload_netns_change(devlink, dest_net);
- err = devlink->ops->reload_up(devlink, extack);
+ err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack);
devlink_reload_failed_set(devlink, !!err);
- return err;
+ if (err)
+ return err;
+
+ WARN_ON(!(*actions_performed & BIT(action)));
+ /* Catch drivers updating the remote reload stats from within devlink reload */
+ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats)));
+ devlink_reload_stats_update(devlink, limit, *actions_performed);
+ return 0;
+}
+
+static int
+devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed,
+ enum devlink_command cmd, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ goto free_msg;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed,
+ actions_performed))
+ goto nla_put_failure;
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
}
static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
+ enum devlink_reload_action action;
+ enum devlink_reload_limit limit;
struct net *dest_net = NULL;
+ u32 actions_performed;
int err;
- if (!devlink_reload_supported(devlink))
+ if (!devlink_reload_supported(devlink->ops))
return -EOPNOTSUPP;
err = devlink_resources_validate(devlink, NULL, info);
@@ -3006,20 +3266,67 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
return PTR_ERR(dest_net);
}
- err = devlink_reload(devlink, dest_net, info->extack);
+ if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION])
+ action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]);
+ else
+ action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT;
+
+ if (!devlink_reload_action_is_supported(devlink, action)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Requested reload action is not supported by the driver");
+ return -EOPNOTSUPP;
+ }
+
+ limit = DEVLINK_RELOAD_LIMIT_UNSPEC;
+ if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) {
+ struct nla_bitfield32 limits;
+ u32 limits_selected;
+
+ limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]);
+ limits_selected = limits.value & limits.selector;
+ if (!limits_selected) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected");
+ return -EINVAL;
+ }
+ for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++)
+ if (limits_selected & BIT(limit))
+ break;
+ /* UAPI enables multiselection, but currently it is not used */
+ if (limits_selected != BIT(limit)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Multiselection of limit is not supported");
+ return -EOPNOTSUPP;
+ }
+ if (!devlink_reload_limit_is_supported(devlink, limit)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Requested limit is not supported by the driver");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_reload_combination_is_invalid(action, limit)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Requested limit is invalid for this action");
+ return -EINVAL;
+ }
+ }
+ err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack);
if (dest_net)
put_net(dest_net);
- return err;
+ if (err)
+ return err;
+ /* For backward compatibility generate reply only if attributes used by user */
+ if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS])
+ return 0;
+
+ return devlink_nl_reload_actions_performed_snd(devlink, actions_performed,
+ DEVLINK_CMD_RELOAD, info);
}
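Editor's note: DEVLINK_ATTR_RELOAD_LIMITS is an NLA_BITFIELD32, so userspace sends a value/selector pair; the handler above masks value & selector, insists that exactly one bit remains (the UAPI allows multi-selection but the implementation does not use it yet), and then validates the resulting limit. A small userspace sketch of that parsing rule, with a hypothetical struct mirroring nla_bitfield32:

#include <stdio.h>
#include <stdint.h>

/* Mirrors the layout of struct nla_bitfield32 for illustration. */
struct bitfield32 { uint32_t value; uint32_t selector; };

/* Return the single selected bit index, or -1 if zero or more than one
 * bit was selected - the same rule devlink_nl_cmd_reload() enforces. */
static int parse_single_limit(struct bitfield32 bf)
{
    uint32_t selected = bf.value & bf.selector;
    int limit;

    if (!selected)
        return -1;
    for (limit = 0; limit < 32; limit++)
        if (selected & (1u << limit))
            break;
    if (selected != (1u << limit))
        return -1;   /* multi-selection not supported */
    return limit;
}

int main(void)
{
    struct bitfield32 ok    = { .value = 0x2, .selector = 0x2 };
    struct bitfield32 multi = { .value = 0x6, .selector = 0x6 };

    printf("single: %d, multi: %d\n",
           parse_single_limit(ok), parse_single_limit(multi));
    return 0;
}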
static int devlink_nl_flash_update_fill(struct sk_buff *msg,
struct devlink *devlink,
enum devlink_command cmd,
- const char *status_msg,
- const char *component,
- unsigned long done, unsigned long total)
+ struct devlink_flash_notify *params)
{
void *hdr;
@@ -3033,19 +3340,22 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg,
if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
goto out;
- if (status_msg &&
+ if (params->status_msg &&
nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
- status_msg))
+ params->status_msg))
goto nla_put_failure;
- if (component &&
+ if (params->component &&
nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
- component))
+ params->component))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
- done, DEVLINK_ATTR_PAD))
+ params->done, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
- total, DEVLINK_ATTR_PAD))
+ params->total, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT,
+ params->timeout, DEVLINK_ATTR_PAD))
goto nla_put_failure;
out:
@@ -3059,10 +3369,7 @@ nla_put_failure:
static void __devlink_flash_update_notify(struct devlink *devlink,
enum devlink_command cmd,
- const char *status_msg,
- const char *component,
- unsigned long done,
- unsigned long total)
+ struct devlink_flash_notify *params)
{
struct sk_buff *msg;
int err;
@@ -3075,8 +3382,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink,
if (!msg)
return;
- err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg,
- component, done, total);
+ err = devlink_nl_flash_update_fill(msg, devlink, cmd, params);
if (err)
goto out_free_msg;
@@ -3088,21 +3394,23 @@ out_free_msg:
nlmsg_free(msg);
}
-void devlink_flash_update_begin_notify(struct devlink *devlink)
+static void devlink_flash_update_begin_notify(struct devlink *devlink)
{
+ struct devlink_flash_notify params = { 0 };
+
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE,
- NULL, NULL, 0, 0);
+ &params);
}
-EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify);
-void devlink_flash_update_end_notify(struct devlink *devlink)
+static void devlink_flash_update_end_notify(struct devlink *devlink)
{
+ struct devlink_flash_notify params = { 0 };
+
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE_END,
- NULL, NULL, 0, 0);
+ &params);
}
-EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify);
void devlink_flash_update_status_notify(struct devlink *devlink,
const char *status_msg,
@@ -3110,31 +3418,92 @@ void devlink_flash_update_status_notify(struct devlink *devlink,
unsigned long done,
unsigned long total)
{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .done = done,
+ .total = total,
+ };
+
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE_STATUS,
- status_msg, component, done, total);
+ &params);
}
EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+void devlink_flash_update_timeout_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long timeout)
+{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .timeout = timeout,
+ };
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ &params);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify);
+
static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
struct genl_info *info)
{
+ struct nlattr *nla_component, *nla_overwrite_mask, *nla_file_name;
+ struct devlink_flash_update_params params = {};
struct devlink *devlink = info->user_ptr[0];
- const char *file_name, *component;
- struct nlattr *nla_component;
+ const char *file_name;
+ u32 supported_params;
+ int ret;
if (!devlink->ops->flash_update)
return -EOPNOTSUPP;
if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME])
return -EINVAL;
- file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]);
+
+ supported_params = devlink->ops->supported_flash_update_params;
nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT];
- component = nla_component ? nla_data(nla_component) : NULL;
+ if (nla_component) {
+ if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, nla_component,
+ "component update is not supported by this device");
+ return -EOPNOTSUPP;
+ }
+ params.component = nla_data(nla_component);
+ }
+
+ nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK];
+ if (nla_overwrite_mask) {
+ struct nla_bitfield32 sections;
+
+ if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask,
+ "overwrite settings are not supported by this device");
+ return -EOPNOTSUPP;
+ }
+ sections = nla_get_bitfield32(nla_overwrite_mask);
+ params.overwrite_mask = sections.value & sections.selector;
+ }
+
+ nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME];
+ file_name = nla_data(nla_file_name);
+ ret = request_firmware(&params.fw, file_name, devlink->dev);
+ if (ret) {
+ NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file");
+ return ret;
+ }
+
+ devlink_flash_update_begin_notify(devlink);
+ ret = devlink->ops->flash_update(devlink, &params, info->extack);
+ devlink_flash_update_end_notify(devlink);
- return devlink->ops->flash_update(devlink, file_name, component,
- info->extack);
+ release_firmware(params.fw);
+
+ return ret;
}
static const struct devlink_param devlink_param_generic[] = {
@@ -3188,6 +3557,11 @@ static const struct devlink_param devlink_param_generic[] = {
.name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE,
},
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE,
+ },
};
static int devlink_param_generic_verify(const struct devlink_param *param)
@@ -3772,7 +4146,7 @@ out:
static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink_param_item *param_item;
struct sk_buff *msg;
int err;
@@ -3801,7 +4175,7 @@ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb,
static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_port *devlink_port = info->user_ptr[1];
return __devlink_nl_cmd_param_set_doit(devlink_port->devlink,
devlink_port->index,
@@ -3875,6 +4249,13 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
if (err)
goto nla_put_failure;
+ if (region->port) {
+ err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ region->port->index);
+ if (err)
+ goto nla_put_failure;
+ }
+
err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name);
if (err)
goto nla_put_failure;
@@ -3922,6 +4303,13 @@ devlink_nl_region_notify_build(struct devlink_region *region,
if (err)
goto out_cancel_msg;
+ if (region->port) {
+ err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ region->port->index);
+ if (err)
+ goto out_cancel_msg;
+ }
+
err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
region->ops->name);
if (err)
@@ -4168,16 +4556,30 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
+ struct devlink_port *port = NULL;
struct devlink_region *region;
const char *region_name;
struct sk_buff *msg;
+ unsigned int index;
int err;
if (!info->attrs[DEVLINK_ATTR_REGION_NAME])
return -EINVAL;
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port)
+ return -ENODEV;
+ }
+
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
- region = devlink_region_get_by_name(devlink, region_name);
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
if (!region)
return -EINVAL;
@@ -4196,10 +4598,75 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
return genlmsg_reply(msg, info);
}
+static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ struct devlink_port *port,
+ int *idx,
+ int start)
+{
+ struct devlink_region *region;
+ int err = 0;
+
+ list_for_each_entry(region, &port->region_list, list) {
+ if (*idx < start) {
+ (*idx)++;
+ continue;
+ }
+ err = devlink_nl_region_fill(msg, port->devlink,
+ DEVLINK_CMD_REGION_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, region);
+ if (err)
+ goto out;
+ (*idx)++;
+ }
+
+out:
+ return err;
+}
+
+static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ struct devlink *devlink,
+ int *idx,
+ int start)
+{
+ struct devlink_region *region;
+ struct devlink_port *port;
+ int err = 0;
+
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(region, &devlink->region_list, list) {
+ if (*idx < start) {
+ (*idx)++;
+ continue;
+ }
+ err = devlink_nl_region_fill(msg, devlink,
+ DEVLINK_CMD_REGION_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, region);
+ if (err)
+ goto out;
+ (*idx)++;
+ }
+
+ list_for_each_entry(port, &devlink->port_list, list) {
+ err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, idx,
+ start);
+ if (err)
+ goto out;
+ }
+
+out:
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+
static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
struct netlink_callback *cb)
{
- struct devlink_region *region;
struct devlink *devlink;
int start = cb->args[0];
int idx = 0;
@@ -4209,25 +4676,10 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
list_for_each_entry(devlink, &devlink_list, list) {
if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
continue;
-
- mutex_lock(&devlink->lock);
- list_for_each_entry(region, &devlink->region_list, list) {
- if (idx < start) {
- idx++;
- continue;
- }
- err = devlink_nl_region_fill(msg, devlink,
- DEVLINK_CMD_REGION_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI, region);
- if (err) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- idx++;
- }
- mutex_unlock(&devlink->lock);
+ err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink,
+ &idx, start);
+ if (err)
+ goto out;
}
out:
mutex_unlock(&devlink_mutex);
@@ -4240,8 +4692,10 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb,
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_snapshot *snapshot;
+ struct devlink_port *port = NULL;
struct devlink_region *region;
const char *region_name;
+ unsigned int index;
u32 snapshot_id;
if (!info->attrs[DEVLINK_ATTR_REGION_NAME] ||
@@ -4251,7 +4705,19 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb,
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
- region = devlink_region_get_by_name(devlink, region_name);
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port)
+ return -ENODEV;
+ }
+
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
if (!region)
return -EINVAL;
@@ -4268,9 +4734,11 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_snapshot *snapshot;
+ struct devlink_port *port = NULL;
struct nlattr *snapshot_id_attr;
struct devlink_region *region;
const char *region_name;
+ unsigned int index;
u32 snapshot_id;
u8 *data;
int err;
@@ -4281,7 +4749,20 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
}
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
- region = devlink_region_get_by_name(devlink, region_name);
+
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port)
+ return -ENODEV;
+ }
+
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
if (!region) {
NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist");
return -EINVAL;
@@ -4317,7 +4798,12 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
}
}
- err = region->ops->snapshot(devlink, info->extack, &data);
+ if (port)
+ err = region->port_ops->snapshot(port, region->port_ops,
+ info->extack, &data);
+ else
+ err = region->ops->snapshot(devlink, region->ops,
+ info->extack, &data);
if (err)
goto err_snapshot_capture;
@@ -4439,10 +4925,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
const struct genl_dumpit_info *info = genl_dumpit_info(cb);
u64 ret_offset, start_offset, end_offset = U64_MAX;
struct nlattr **attrs = info->attrs;
+ struct devlink_port *port = NULL;
struct devlink_region *region;
struct nlattr *chunks_attr;
const char *region_name;
struct devlink *devlink;
+ unsigned int index;
void *hdr;
int err;
@@ -4463,8 +4951,23 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
goto out_unlock;
}
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
- region = devlink_region_get_by_name(devlink, region_name);
+
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
if (!region) {
err = -EINVAL;
goto out_unlock;
@@ -4501,6 +5004,13 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
if (err)
goto nla_put_failure;
+ if (region->port) {
+ err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX,
+ region->port->index);
+ if (err)
+ goto nla_put_failure;
+ }
+
err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
if (err)
goto nla_put_failure;
@@ -5895,6 +6405,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
list_for_each_entry(devlink, &devlink_list, list) {
if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
continue;
+ mutex_lock(&devlink->lock);
list_for_each_entry(port, &devlink->port_list, list) {
mutex_lock(&port->reporters_lock);
list_for_each_entry(reporter, &port->reporter_list, list) {
@@ -5909,12 +6420,14 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
NLM_F_MULTI);
if (err) {
mutex_unlock(&port->reporters_lock);
+ mutex_unlock(&devlink->lock);
goto out;
}
idx++;
}
mutex_unlock(&port->reporters_lock);
}
+ mutex_unlock(&devlink->lock);
}
out:
mutex_unlock(&devlink_mutex);
@@ -6088,6 +6601,28 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb,
return 0;
}
+static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+ int err;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ if (!reporter->ops->test) {
+ devlink_health_reporter_put(reporter);
+ return -EOPNOTSUPP;
+ }
+
+ err = reporter->ops->test(reporter, info->extack);
+
+ devlink_health_reporter_put(reporter);
+ return err;
+}
+
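The new DEVLINK_CMD_HEALTH_REPORTER_TEST command simply dispatches to a driver-provided ->test() callback, whose signature can be read off the reporter->ops->test(reporter, info->extack) call above. A hedged sketch of the driver side (the example_* names are made up, only the .test hook is new here):

	/* Sketch: a self-test callback that injects a synthetic error so the
	 * reporter's recover/dump path can be exercised from userspace.
	 */
	static int example_reporter_test(struct devlink_health_reporter *reporter,
					 struct netlink_ext_ack *extack)
	{
		return devlink_health_report(reporter, "self-test induced error", NULL);
	}

	static const struct devlink_health_reporter_ops example_reporter_ops = {
		.name = "example",
		.test = example_reporter_test,
		/* .recover / .dump / .diagnose as before */
	};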
struct devlink_stats {
u64 rx_bytes;
u64 rx_packets;
@@ -6458,7 +6993,6 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb,
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_item *trap_item;
- int err;
if (list_empty(&devlink->trap_list))
return -EOPNOTSUPP;
@@ -6469,11 +7003,7 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb,
return -ENOENT;
}
- err = devlink_trap_action_set(devlink, trap_item, info);
- if (err)
- return err;
-
- return 0;
+ return devlink_trap_action_set(devlink, trap_item, info);
}
static struct devlink_trap_group_item *
@@ -6644,6 +7174,24 @@ __devlink_trap_group_action_set(struct devlink *devlink,
struct devlink_trap_item *trap_item;
int err;
+ if (devlink->ops->trap_group_action_set) {
+ err = devlink->ops->trap_group_action_set(devlink, group_item->group,
+ trap_action, extack);
+ if (err)
+ return err;
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list) {
+ if (strcmp(trap_item->group_item->group->name, group_name))
+ continue;
+ if (trap_item->action != trap_action &&
+ trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP)
+ continue;
+ trap_item->action = trap_action;
+ }
+
+ return 0;
+ }
+
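With the optional trap_group_action_set() op, a driver can program the action for an entire trap group in one device transaction instead of per-trap calls; the core then updates the cached action of the group's drop traps to match, as the loop above shows. A hedged sketch of a driver implementation, with the callback signature taken from the call above and the example_* names made up:

	static int example_trap_group_action_set(struct devlink *devlink,
						 const struct devlink_trap_group *group,
						 enum devlink_trap_action action,
						 struct netlink_ext_ack *extack)
	{
		struct example_priv *priv = devlink_priv(devlink);

		/* One firmware/register transaction for the whole group. */
		return example_hw_set_group_action(priv, group->id, action);
	}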
list_for_each_entry(trap_item, &devlink->trap_list, list) {
if (strcmp(trap_item->group_item->group->name, group_name))
continue;
@@ -7000,7 +7548,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
- [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO,
+ DEVLINK_PORT_TYPE_IB),
[DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 },
@@ -7009,7 +7558,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
- [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY,
+ DEVLINK_ESWITCH_MODE_SWITCHDEV),
[DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
[DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 },
[DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING },
@@ -7028,6 +7578,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] =
+ NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS),
[DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 },
[DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING },
@@ -7039,9 +7591,12 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 },
[DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 },
[DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED },
+ [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ DEVLINK_RELOAD_ACTION_MAX),
+ [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK),
};
-static const struct genl_ops devlink_nl_ops[] = {
+static const struct genl_small_ops devlink_nl_ops[] = {
{
.cmd = DEVLINK_CMD_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -7309,6 +7864,14 @@ static const struct genl_ops devlink_nl_ops[] = {
DEVLINK_NL_FLAG_NO_LOCK,
},
{
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = devlink_nl_cmd_health_reporter_test_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT |
+ DEVLINK_NL_FLAG_NO_LOCK,
+ },
+ {
.cmd = DEVLINK_CMD_FLASH_UPDATE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_flash_update,
@@ -7358,12 +7921,41 @@ static struct genl_family devlink_nl_family __ro_after_init = {
.pre_doit = devlink_nl_pre_doit,
.post_doit = devlink_nl_post_doit,
.module = THIS_MODULE,
- .ops = devlink_nl_ops,
- .n_ops = ARRAY_SIZE(devlink_nl_ops),
+ .small_ops = devlink_nl_ops,
+ .n_small_ops = ARRAY_SIZE(devlink_nl_ops),
.mcgrps = devlink_nl_mcgrps,
.n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
};
+static bool devlink_reload_actions_valid(const struct devlink_ops *ops)
+{
+ const struct devlink_reload_combination *comb;
+ int i;
+
+ if (!devlink_reload_supported(ops)) {
+ if (WARN_ON(ops->reload_actions))
+ return false;
+ return true;
+ }
+
+ if (WARN_ON(!ops->reload_actions ||
+ ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
+ ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX)))
+ return false;
+
+ if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) ||
+ ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX)))
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) {
+ comb = &devlink_reload_invalid_combinations[i];
+ if (ops->reload_actions == BIT(comb->action) &&
+ ops->reload_limits == BIT(comb->limit))
+ return false;
+ }
+ return true;
+}
+
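devlink_reload_actions_valid() runs at allocation time, so a driver that supports reload must now declare up front which actions and limits it implements. A rough sketch of the relevant devlink_ops fields, assuming the FW_ACTIVATE action and NO_RESET limit added alongside this series; the example_* reload callbacks are placeholders for the driver's existing reload_down()/reload_up() pair:

	static const struct devlink_ops example_devlink_ops = {
		/* Can re-initialize the driver and activate new firmware, the
		 * latter also without a reset ("live" activation).
		 */
		.reload_actions	= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
				  BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE),
		.reload_limits	= BIT(DEVLINK_RELOAD_LIMIT_NO_RESET),
		.reload_down	= example_reload_down,
		.reload_up	= example_reload_up,
	};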
/**
* devlink_alloc - Allocate new devlink instance resources
*
@@ -7380,6 +7972,9 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
if (WARN_ON(!ops))
return NULL;
+ if (!devlink_reload_actions_valid(ops))
+ return NULL;
+
devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
if (!devlink)
return NULL;
@@ -7428,7 +8023,7 @@ EXPORT_SYMBOL_GPL(devlink_register);
void devlink_unregister(struct devlink *devlink)
{
mutex_lock(&devlink_mutex);
- WARN_ON(devlink_reload_supported(devlink) &&
+ WARN_ON(devlink_reload_supported(devlink->ops) &&
devlink->reload_enabled);
devlink_notify(devlink, DEVLINK_CMD_DEL);
list_del(&devlink->list);
@@ -7506,7 +8101,8 @@ static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
{
/* Ignore CPU and DSA flavours. */
return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
- devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA;
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED;
}
#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600)
@@ -7555,11 +8151,12 @@ int devlink_port_register(struct devlink *devlink,
devlink_port->index = port_index;
devlink_port->registered = true;
spin_lock_init(&devlink_port->type_lock);
+ INIT_LIST_HEAD(&devlink_port->reporter_list);
+ mutex_init(&devlink_port->reporters_lock);
list_add_tail(&devlink_port->list, &devlink->port_list);
INIT_LIST_HEAD(&devlink_port->param_list);
+ INIT_LIST_HEAD(&devlink_port->region_list);
mutex_unlock(&devlink->lock);
- INIT_LIST_HEAD(&devlink_port->reporter_list);
- mutex_init(&devlink_port->reporters_lock);
INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
devlink_port_type_warn_schedule(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
@@ -7576,13 +8173,14 @@ void devlink_port_unregister(struct devlink_port *devlink_port)
{
struct devlink *devlink = devlink_port->devlink;
- WARN_ON(!list_empty(&devlink_port->reporter_list));
- mutex_destroy(&devlink_port->reporters_lock);
devlink_port_type_warn_cancel(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
mutex_lock(&devlink->lock);
list_del(&devlink_port->list);
mutex_unlock(&devlink->lock);
+ WARN_ON(!list_empty(&devlink_port->reporter_list));
+ WARN_ON(!list_empty(&devlink_port->region_list));
+ mutex_destroy(&devlink_port->reporters_lock);
}
EXPORT_SYMBOL_GPL(devlink_port_unregister);
@@ -7600,14 +8198,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}
-/**
- * devlink_port_type_eth_set - Set port type to Ethernet
- *
- * @devlink_port: devlink port
- * @netdev: related netdevice
- */
-void devlink_port_type_eth_set(struct devlink_port *devlink_port,
- struct net_device *netdev)
+static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port,
+ struct net_device *netdev)
{
const struct net_device_ops *ops = netdev->netdev_ops;
@@ -7641,6 +8233,24 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port,
err = ops->ndo_get_port_parent_id(netdev, &ppid);
WARN_ON(err != -EOPNOTSUPP);
}
+}
+
+/**
+ * devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ * @devlink_port: devlink port
+ * @netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+ struct net_device *netdev)
+{
+ if (netdev)
+ devlink_port_type_netdev_checks(devlink_port, netdev);
+ else
+ dev_warn(devlink_port->devlink->dev,
+ "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n",
+ devlink_port->index);
+
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev);
}
EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
@@ -7675,8 +8285,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
- if (WARN_ON(devlink_port->registered))
- return -EEXIST;
devlink_port->attrs_set = true;
attrs->flavour = flavour;
if (attrs->switch_id.id_len) {
@@ -7700,6 +8308,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
{
int ret;
+ if (WARN_ON(devlink_port->registered))
+ return;
devlink_port->attrs = *attrs;
ret = __devlink_port_attrs_set(devlink_port, attrs->flavour);
if (ret)
@@ -7712,19 +8322,25 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
* devlink_port_attrs_pci_pf_set - Set PCI PF port attributes
*
* @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
+ * @external: indicates if the port is for an external controller
*/
-void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf)
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
+ if (WARN_ON(devlink_port->registered))
+ return;
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_PF);
if (ret)
return;
-
+ attrs->pci_pf.controller = controller;
attrs->pci_pf.pf = pf;
+ attrs->pci_pf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
@@ -7732,21 +8348,27 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
* devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
*
* @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
* @vf: associated VF of a PF for the devlink port instance
+ * @external: indicates if the port is for an external controller
*/
-void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
- u16 pf, u16 vf)
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, u16 vf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
+ if (WARN_ON(devlink_port->registered))
+ return;
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_VF);
if (ret)
return;
+ attrs->pci_vf.controller = controller;
attrs->pci_vf.pf = pf;
attrs->pci_vf.vf = vf;
+ attrs->pci_vf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
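Callers of the PCI attribute helpers now pass the controller number and an external flag in addition to the PF/VF numbers. A hedged example of the updated call sites in a driver (the numbers are purely illustrative):

	/* Port backed by the local (host) controller 0, PF 0. */
	devlink_port_attrs_pci_pf_set(devlink_port, 0, 0, false);

	/* Representor for VF 2 of PF 0 on external controller 1, e.g. a host
	 * behind a SmartNIC.
	 */
	devlink_port_attrs_pci_vf_set(devlink_port, 1, 0, 2, true);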
@@ -7771,15 +8393,30 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
break;
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
+ case DEVLINK_PORT_FLAVOUR_UNUSED:
/* As CPU and DSA ports do not have a netdevice associated
* case should not ever happen.
*/
WARN_ON(1);
return -EINVAL;
case DEVLINK_PORT_FLAVOUR_PCI_PF:
+ if (attrs->pci_pf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_pf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
break;
case DEVLINK_PORT_FLAVOUR_PCI_VF:
+ if (attrs->pci_vf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_vf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
n = snprintf(name, len, "pf%uvf%u",
attrs->pci_vf.pf, attrs->pci_vf.vf);
break;
@@ -8431,7 +9068,7 @@ __devlink_param_driverinit_value_set(struct devlink *devlink,
int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
union devlink_param_value *init_val)
{
- if (!devlink_reload_supported(devlink))
+ if (!devlink_reload_supported(devlink->ops))
return -EOPNOTSUPP;
return __devlink_param_driverinit_value_get(&devlink->param_list,
@@ -8478,7 +9115,7 @@ int devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port,
{
struct devlink *devlink = devlink_port->devlink;
- if (!devlink_reload_supported(devlink))
+ if (!devlink_reload_supported(devlink->ops))
return -EOPNOTSUPP;
return __devlink_param_driverinit_value_get(&devlink_port->param_list,
@@ -8627,6 +9264,57 @@ unlock:
EXPORT_SYMBOL_GPL(devlink_region_create);
/**
+ * devlink_port_region_create - create a new address region for a port
+ *
+ * @port: devlink port
+ * @ops: region operations and name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ */
+struct devlink_region *
+devlink_port_region_create(struct devlink_port *port,
+ const struct devlink_port_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ struct devlink *devlink = port->devlink;
+ struct devlink_region *region;
+ int err = 0;
+
+ if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+ return ERR_PTR(-EINVAL);
+
+ mutex_lock(&devlink->lock);
+
+ if (devlink_port_region_get_by_name(port, ops->name)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ region->devlink = devlink;
+ region->port = port;
+ region->max_snapshots = region_max_snapshots;
+ region->port_ops = ops;
+ region->size = region_size;
+ INIT_LIST_HEAD(&region->snapshot_list);
+ list_add_tail(&region->list, &port->region_list);
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
+
+ mutex_unlock(&devlink->lock);
+ return region;
+
+unlock:
+ mutex_unlock(&devlink->lock);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(devlink_port_region_create);
+
+/**
* devlink_region_destroy - destroy address region
*
* @region: devlink region to destroy
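For drivers, wiring up a port region looks much like the existing device regions, except the ops structure is a devlink_port_region_ops and the snapshot callback receives the port; the callback signature matches the region->port_ops->snapshot(port, ops, extack, &data) call earlier in this patch. A hedged sketch with made-up names and sizes:

	static int example_port_snapshot(struct devlink_port *port,
					 const struct devlink_port_region_ops *ops,
					 struct netlink_ext_ack *extack, u8 **data)
	{
		u8 *buf;

		buf = kzalloc(EXAMPLE_REGION_SIZE, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		example_hw_read_port_regs(port, buf);	/* placeholder */
		*data = buf;
		return 0;
	}

	static const struct devlink_port_region_ops example_port_region_ops = {
		.name = "port-registers",
		.destructor = kfree,
		.snapshot = example_port_snapshot,
	};

	/* At port init time: up to 8 snapshots of EXAMPLE_REGION_SIZE bytes. */
	region = devlink_port_region_create(devlink_port, &example_port_region_ops,
					    8, EXAMPLE_REGION_SIZE);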
@@ -8803,6 +9491,23 @@ static const struct devlink_trap devlink_trap_generic[] = {
DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL),
DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL),
DEVLINK_TRAP(EARLY_DROP, DROP),
+ DEVLINK_TRAP(VXLAN_PARSING, DROP),
+ DEVLINK_TRAP(LLC_SNAP_PARSING, DROP),
+ DEVLINK_TRAP(VLAN_PARSING, DROP),
+ DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP),
+ DEVLINK_TRAP(MPLS_PARSING, DROP),
+ DEVLINK_TRAP(ARP_PARSING, DROP),
+ DEVLINK_TRAP(IP_1_PARSING, DROP),
+ DEVLINK_TRAP(IP_N_PARSING, DROP),
+ DEVLINK_TRAP(GRE_PARSING, DROP),
+ DEVLINK_TRAP(UDP_PARSING, DROP),
+ DEVLINK_TRAP(TCP_PARSING, DROP),
+ DEVLINK_TRAP(IPSEC_PARSING, DROP),
+ DEVLINK_TRAP(SCTP_PARSING, DROP),
+ DEVLINK_TRAP(DCCP_PARSING, DROP),
+ DEVLINK_TRAP(GTP_PARSING, DROP),
+ DEVLINK_TRAP(ESP_PARSING, DROP),
+ DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP),
};
#define DEVLINK_TRAP_GROUP(_id) \
@@ -8837,6 +9542,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = {
DEVLINK_TRAP_GROUP(PTP_GENERAL),
DEVLINK_TRAP_GROUP(ACL_SAMPLE),
DEVLINK_TRAP_GROUP(ACL_TRAP),
+ DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS),
};
static int devlink_trap_generic_verify(const struct devlink_trap *trap)
@@ -9139,20 +9845,19 @@ devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats,
}
static void
-devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata,
- const struct devlink_trap_item *trap_item,
- struct devlink_port *in_devlink_port,
- const struct flow_action_cookie *fa_cookie)
+devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata,
+ const struct devlink_trap_item *trap_item,
+ struct devlink_port *in_devlink_port,
+ const struct flow_action_cookie *fa_cookie)
{
- struct devlink_trap_group_item *group_item = trap_item->group_item;
-
- hw_metadata->trap_group_name = group_item->group->name;
- hw_metadata->trap_name = trap_item->trap->name;
- hw_metadata->fa_cookie = fa_cookie;
+ metadata->trap_name = trap_item->trap->name;
+ metadata->trap_group_name = trap_item->group_item->group->name;
+ metadata->fa_cookie = fa_cookie;
+ metadata->trap_type = trap_item->trap->type;
spin_lock(&in_devlink_port->type_lock);
if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH)
- hw_metadata->input_dev = in_devlink_port->type_dev;
+ metadata->input_dev = in_devlink_port->type_dev;
spin_unlock(&in_devlink_port->type_lock);
}
@@ -9170,21 +9875,17 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
{
struct devlink_trap_item *trap_item = trap_ctx;
- struct net_dm_hw_metadata hw_metadata = {};
devlink_trap_stats_update(trap_item->stats, skb->len);
devlink_trap_stats_update(trap_item->group_item->stats, skb->len);
- /* Control packets were not dropped by the device or encountered an
- * exception during forwarding and therefore should not be reported to
- * the kernel's drop monitor.
- */
- if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL)
- return;
+ if (trace_devlink_trap_report_enabled()) {
+ struct devlink_trap_metadata metadata = {};
- devlink_trap_report_metadata_fill(&hw_metadata, trap_item,
- in_devlink_port, fa_cookie);
- net_dm_hw_report(skb, &hw_metadata);
+ devlink_trap_report_metadata_set(&metadata, trap_item,
+ in_devlink_port, fa_cookie);
+ trace_devlink_trap_report(devlink, skb, &metadata);
+ }
}
EXPORT_SYMBOL_GPL(devlink_trap_report);
@@ -9543,6 +10244,7 @@ out:
int devlink_compat_flash_update(struct net_device *dev, const char *file_name)
{
+ struct devlink_flash_update_params params = {};
struct devlink *devlink;
int ret;
@@ -9555,10 +10257,18 @@ int devlink_compat_flash_update(struct net_device *dev, const char *file_name)
goto out;
}
+ ret = request_firmware(&params.fw, file_name, devlink->dev);
+ if (ret)
+ goto out;
+
mutex_lock(&devlink->lock);
- ret = devlink->ops->flash_update(devlink, file_name, NULL, NULL);
+ devlink_flash_update_begin_notify(devlink);
+ ret = devlink->ops->flash_update(devlink, &params, NULL);
+ devlink_flash_update_end_notify(devlink);
mutex_unlock(&devlink->lock);
+ release_firmware(params.fw);
+
out:
rtnl_lock();
dev_put(dev);
@@ -9605,6 +10315,7 @@ int devlink_compat_switch_id_get(struct net_device *dev,
static void __net_exit devlink_pernet_pre_exit(struct net *net)
{
struct devlink *devlink;
+ u32 actions_performed;
int err;
/* In case network namespace is getting destroyed, reload
@@ -9613,9 +10324,12 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
mutex_lock(&devlink_mutex);
list_for_each_entry(devlink, &devlink_list, list) {
if (net_eq(devlink_net(devlink), net)) {
- if (WARN_ON(!devlink_reload_supported(devlink)))
+ if (WARN_ON(!devlink_reload_supported(devlink->ops)))
continue;
- err = devlink_reload(devlink, &init_net, NULL);
+ err = devlink_reload(devlink, &init_net,
+ DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ DEVLINK_RELOAD_LIMIT_UNSPEC,
+ &actions_performed, NULL);
if (err && err != -EOPNOTSUPP)
pr_warn("Failed to reload devlink instance into init_net\n");
}
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 9704522b0872..571f191c06d9 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -26,13 +26,14 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/module.h>
-#include <net/drop_monitor.h>
#include <net/genetlink.h>
#include <net/netevent.h>
#include <net/flow_offload.h>
+#include <net/devlink.h>
#include <trace/events/skb.h>
#include <trace/events/napi.h>
+#include <trace/events/devlink.h>
#include <asm/unaligned.h>
@@ -114,13 +115,14 @@ struct net_dm_alert_ops {
int work, int budget);
void (*work_item_func)(struct work_struct *work);
void (*hw_work_item_func)(struct work_struct *work);
- void (*hw_probe)(struct sk_buff *skb,
- const struct net_dm_hw_metadata *hw_metadata);
+ void (*hw_trap_probe)(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata);
};
struct net_dm_skb_cb {
union {
- struct net_dm_hw_metadata *hw_metadata;
+ struct devlink_trap_metadata *hw_metadata;
void *pc;
};
};
@@ -432,8 +434,9 @@ out:
}
static void
-net_dm_hw_summary_probe(struct sk_buff *skb,
- const struct net_dm_hw_metadata *hw_metadata)
+net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata)
{
struct net_dm_hw_entries *hw_entries;
struct net_dm_hw_entry *hw_entry;
@@ -441,6 +444,9 @@ net_dm_hw_summary_probe(struct sk_buff *skb,
unsigned long flags;
int i;
+ if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL)
+ return;
+
hw_data = this_cpu_ptr(&dm_hw_cpu_data);
spin_lock_irqsave(&hw_data->lock, flags);
hw_entries = hw_data->hw_entries;
@@ -450,7 +456,7 @@ net_dm_hw_summary_probe(struct sk_buff *skb,
for (i = 0; i < hw_entries->num_entries; i++) {
hw_entry = &hw_entries->entries[i];
- if (!strncmp(hw_entry->trap_name, hw_metadata->trap_name,
+ if (!strncmp(hw_entry->trap_name, metadata->trap_name,
NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) {
hw_entry->count++;
goto out;
@@ -460,7 +466,7 @@ net_dm_hw_summary_probe(struct sk_buff *skb,
goto out;
hw_entry = &hw_entries->entries[hw_entries->num_entries];
- strlcpy(hw_entry->trap_name, hw_metadata->trap_name,
+ strlcpy(hw_entry->trap_name, metadata->trap_name,
NET_DM_MAX_HW_TRAP_NAME_LEN - 1);
hw_entry->count = 1;
hw_entries->num_entries++;
@@ -479,7 +485,7 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
.napi_poll_probe = trace_napi_poll_hit,
.work_item_func = send_dm_alert,
.hw_work_item_func = net_dm_hw_summary_work,
- .hw_probe = net_dm_hw_summary_probe,
+ .hw_trap_probe = net_dm_hw_trap_summary_probe,
};
static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
@@ -705,7 +711,7 @@ static void net_dm_packet_work(struct work_struct *work)
}
static size_t
-net_dm_flow_action_cookie_size(const struct net_dm_hw_metadata *hw_metadata)
+net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata)
{
return hw_metadata->fa_cookie ?
nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0;
@@ -713,7 +719,7 @@ net_dm_flow_action_cookie_size(const struct net_dm_hw_metadata *hw_metadata)
static size_t
net_dm_hw_packet_report_size(size_t payload_len,
- const struct net_dm_hw_metadata *hw_metadata)
+ const struct devlink_trap_metadata *hw_metadata)
{
size_t size;
@@ -743,7 +749,7 @@ net_dm_hw_packet_report_size(size_t payload_len,
static int net_dm_hw_packet_report_fill(struct sk_buff *msg,
struct sk_buff *skb, size_t payload_len)
{
- struct net_dm_hw_metadata *hw_metadata;
+ struct devlink_trap_metadata *hw_metadata;
struct nlattr *attr;
void *hdr;
@@ -810,56 +816,56 @@ nla_put_failure:
return -EMSGSIZE;
}
-static struct net_dm_hw_metadata *
-net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata)
+static struct devlink_trap_metadata *
+net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata)
{
const struct flow_action_cookie *fa_cookie;
- struct net_dm_hw_metadata *n_hw_metadata;
+ struct devlink_trap_metadata *hw_metadata;
const char *trap_group_name;
const char *trap_name;
- n_hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC);
- if (!n_hw_metadata)
+ hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC);
+ if (!hw_metadata)
return NULL;
- trap_group_name = kstrdup(hw_metadata->trap_group_name, GFP_ATOMIC);
+ trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC);
if (!trap_group_name)
goto free_hw_metadata;
- n_hw_metadata->trap_group_name = trap_group_name;
+ hw_metadata->trap_group_name = trap_group_name;
- trap_name = kstrdup(hw_metadata->trap_name, GFP_ATOMIC);
+ trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC);
if (!trap_name)
goto free_trap_group;
- n_hw_metadata->trap_name = trap_name;
+ hw_metadata->trap_name = trap_name;
- if (hw_metadata->fa_cookie) {
+ if (metadata->fa_cookie) {
size_t cookie_size = sizeof(*fa_cookie) +
- hw_metadata->fa_cookie->cookie_len;
+ metadata->fa_cookie->cookie_len;
- fa_cookie = kmemdup(hw_metadata->fa_cookie, cookie_size,
+ fa_cookie = kmemdup(metadata->fa_cookie, cookie_size,
GFP_ATOMIC);
if (!fa_cookie)
goto free_trap_name;
- n_hw_metadata->fa_cookie = fa_cookie;
+ hw_metadata->fa_cookie = fa_cookie;
}
- n_hw_metadata->input_dev = hw_metadata->input_dev;
- if (n_hw_metadata->input_dev)
- dev_hold(n_hw_metadata->input_dev);
+ hw_metadata->input_dev = metadata->input_dev;
+ if (hw_metadata->input_dev)
+ dev_hold(hw_metadata->input_dev);
- return n_hw_metadata;
+ return hw_metadata;
free_trap_name:
kfree(trap_name);
free_trap_group:
kfree(trap_group_name);
free_hw_metadata:
- kfree(n_hw_metadata);
+ kfree(hw_metadata);
return NULL;
}
static void
-net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata)
+net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata)
{
if (hw_metadata->input_dev)
dev_put(hw_metadata->input_dev);
@@ -871,7 +877,7 @@ net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata)
static void net_dm_hw_packet_report(struct sk_buff *skb)
{
- struct net_dm_hw_metadata *hw_metadata;
+ struct devlink_trap_metadata *hw_metadata;
struct sk_buff *msg;
size_t payload_len;
int rc;
@@ -924,15 +930,19 @@ static void net_dm_hw_packet_work(struct work_struct *work)
}
static void
-net_dm_hw_packet_probe(struct sk_buff *skb,
- const struct net_dm_hw_metadata *hw_metadata)
+net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata)
{
- struct net_dm_hw_metadata *n_hw_metadata;
+ struct devlink_trap_metadata *n_hw_metadata;
ktime_t tstamp = ktime_get_real();
struct per_cpu_dm_data *hw_data;
struct sk_buff *nskb;
unsigned long flags;
+ if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL)
+ return;
+
if (!skb_mac_header_was_set(skb))
return;
@@ -940,7 +950,7 @@ net_dm_hw_packet_probe(struct sk_buff *skb,
if (!nskb)
return;
- n_hw_metadata = net_dm_hw_metadata_clone(hw_metadata);
+ n_hw_metadata = net_dm_hw_metadata_copy(metadata);
if (!n_hw_metadata)
goto free;
@@ -975,7 +985,7 @@ static const struct net_dm_alert_ops net_dm_alert_packet_ops = {
.napi_poll_probe = net_dm_packet_trace_napi_poll_hit,
.work_item_func = net_dm_packet_work,
.hw_work_item_func = net_dm_hw_packet_work,
- .hw_probe = net_dm_hw_packet_probe,
+ .hw_trap_probe = net_dm_hw_trap_packet_probe,
};
static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
@@ -983,25 +993,32 @@ static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
[NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops,
};
-void net_dm_hw_report(struct sk_buff *skb,
- const struct net_dm_hw_metadata *hw_metadata)
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops)
{
- rcu_read_lock();
-
- if (!monitor_hw)
- goto out;
+ return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL);
+}
- net_dm_alert_ops_arr[net_dm_alert_mode]->hw_probe(skb, hw_metadata);
+static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops)
+{
+ unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL);
+ tracepoint_synchronize_unregister();
+}
+#else
+static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops)
+{
+ return -EOPNOTSUPP;
+}
-out:
- rcu_read_unlock();
+static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops)
+{
}
-EXPORT_SYMBOL_GPL(net_dm_hw_report);
+#endif
static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack)
{
const struct net_dm_alert_ops *ops;
- int cpu;
+ int cpu, rc;
if (monitor_hw) {
NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled");
@@ -1025,13 +1042,24 @@ static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack)
kfree(hw_entries);
}
+ rc = net_dm_hw_probe_register(ops);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint");
+ goto err_module_put;
+ }
+
monitor_hw = true;
return 0;
+
+err_module_put:
+ module_put(THIS_MODULE);
+ return rc;
}
static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
{
+ const struct net_dm_alert_ops *ops;
int cpu;
if (!monitor_hw) {
@@ -1039,12 +1067,11 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
return;
}
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
monitor_hw = false;
- /* After this call returns we are guaranteed that no CPU is processing
- * any hardware drops.
- */
- synchronize_rcu();
+ net_dm_hw_probe_unregister(ops);
for_each_possible_cpu(cpu) {
struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
@@ -1053,7 +1080,7 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
del_timer_sync(&hw_data->send_timer);
cancel_work_sync(&hw_data->dm_alert_work);
while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
- struct net_dm_hw_metadata *hw_metadata;
+ struct devlink_trap_metadata *hw_metadata;
hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
net_dm_hw_metadata_free(hw_metadata);
@@ -1548,7 +1575,7 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
[NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG },
};
-static const struct genl_ops dropmon_ops[] = {
+static const struct genl_small_ops dropmon_ops[] = {
{
.cmd = NET_DM_CMD_CONFIG,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -1598,8 +1625,8 @@ static struct genl_family net_drop_monitor_family __ro_after_init = {
.pre_doit = net_dm_nl_pre_doit,
.post_doit = net_dm_nl_post_doit,
.module = THIS_MODULE,
- .ops = dropmon_ops,
- .n_ops = ARRAY_SIZE(dropmon_ops),
+ .small_ops = dropmon_ops,
+ .n_small_ops = ARRAY_SIZE(dropmon_ops),
.mcgrps = dropmon_mcgrps,
.n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
};
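drop_monitor no longer consumes net_dm_hw_report(); it attaches to the devlink_trap_report tracepoint that devlink now emits. Any other GPL in-kernel consumer could hook the same tracepoint along these lines (a sketch, not part of the patch; the probe prototype mirrors the hw_trap_probe signature above):

	#include <linux/module.h>
	#include <net/devlink.h>
	#include <trace/events/devlink.h>

	static void example_trap_probe(void *ignore, const struct devlink *devlink,
				       struct sk_buff *skb,
				       const struct devlink_trap_metadata *metadata)
	{
		pr_debug("trap %s group %s len %u\n", metadata->trap_name,
			 metadata->trap_group_name, skb->len);
	}

	static int __init example_init(void)
	{
		return register_trace_devlink_trap_report(example_trap_probe, NULL);
	}

	static void __exit example_exit(void)
	{
		unregister_trace_devlink_trap_report(example_trap_probe, NULL);
		tracepoint_synchronize_unregister();
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");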
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 7bcfb16854cb..cd80ffed6d26 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -563,7 +563,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_device *dev;
nlrule->iifindex = -1;
- nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
dev = __dev_get_by_name(net, nlrule->iifname);
if (dev)
nlrule->iifindex = dev->ifindex;
@@ -573,7 +573,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_device *dev;
nlrule->oifindex = -1;
- nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+ nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
dev = __dev_get_by_name(net, nlrule->oifname);
if (dev)
nlrule->oifindex = dev->ifindex;
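The nla_strlcpy() to nla_strscpy() conversions are mechanical here because the return value is ignored; the difference only matters to callers that check it, since nla_strscpy() follows strscpy() conventions and, if memory serves, returns the number of bytes copied or -E2BIG on truncation rather than the untruncated source length.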
diff --git a/net/core/filter.c b/net/core/filter.c
index b5f3faac5e3b..255aeee72402 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -76,6 +76,10 @@
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
+#include <net/tls.h>
+
+static const struct bpf_func_proto *
+bpf_sk_base_func_proto(enum bpf_func_id func_id);
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
@@ -2160,13 +2164,266 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
return __bpf_redirect_no_mac(skb, dev, flags);
}
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
+ struct net_device *dev, struct bpf_nh_params *nh)
+{
+ u32 hh_len = LL_RESERVED_SPACE(dev);
+ const struct in6_addr *nexthop;
+ struct dst_entry *dst = NULL;
+ struct neighbour *neigh;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ goto out_drop;
+ }
+
+ skb->dev = dev;
+ skb->tstamp = 0;
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_realloc_headroom(skb, hh_len);
+ if (unlikely(!skb2)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+ consume_skb(skb);
+ skb = skb2;
+ }
+
+ rcu_read_lock_bh();
+ if (!nh) {
+ dst = skb_dst(skb);
+ nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
+ &ipv6_hdr(skb)->daddr);
+ } else {
+ nexthop = &nh->ipv6_nh;
+ }
+ neigh = ip_neigh_gw6(dev, nexthop);
+ if (likely(!IS_ERR(neigh))) {
+ int ret;
+
+ sock_confirm_neigh(skb, neigh);
+ dev_xmit_recursion_inc();
+ ret = neigh_output(neigh, skb, false);
+ dev_xmit_recursion_dec();
+ rcu_read_unlock_bh();
+ return ret;
+ }
+ rcu_read_unlock_bh();
+ if (dst)
+ IP6_INC_STATS(dev_net(dst->dev),
+ ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+out_drop:
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct net *net = dev_net(dev);
+ int err, ret = NET_XMIT_DROP;
+
+ if (!nh) {
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_flags = FLOWI_FLAG_ANYSRC,
+ .flowi6_mark = skb->mark,
+ .flowlabel = ip6_flowinfo(ip6h),
+ .flowi6_oif = dev->ifindex,
+ .flowi6_proto = ip6h->nexthdr,
+ .daddr = ip6h->daddr,
+ .saddr = ip6h->saddr,
+ };
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
+ if (IS_ERR(dst))
+ goto out_drop;
+
+ skb_dst_set(skb, dst);
+ } else if (nh->nh_family != AF_INET6) {
+ goto out_drop;
+ }
+
+ err = bpf_out_neigh_v6(net, skb, dev, nh);
+ if (unlikely(net_xmit_eval(err)))
+ dev->stats.tx_errors++;
+ else
+ ret = NET_XMIT_SUCCESS;
+ goto out_xmit;
+out_drop:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+out_xmit:
+ return ret;
+}
+#else
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+#endif /* CONFIG_IPV6 */
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
+ struct net_device *dev, struct bpf_nh_params *nh)
+{
+ u32 hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
+ bool is_v6gw = false;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ goto out_drop;
+ }
+
+ skb->dev = dev;
+ skb->tstamp = 0;
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_realloc_headroom(skb, hh_len);
+ if (unlikely(!skb2)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+ consume_skb(skb);
+ skb = skb2;
+ }
+
+ rcu_read_lock_bh();
+ if (!nh) {
+ struct dst_entry *dst = skb_dst(skb);
+ struct rtable *rt = container_of(dst, struct rtable, dst);
+
+ neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+ } else if (nh->nh_family == AF_INET6) {
+ neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
+ is_v6gw = true;
+ } else if (nh->nh_family == AF_INET) {
+ neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
+ } else {
+ rcu_read_unlock_bh();
+ goto out_drop;
+ }
+
+ if (likely(!IS_ERR(neigh))) {
+ int ret;
+
+ sock_confirm_neigh(skb, neigh);
+ dev_xmit_recursion_inc();
+ ret = neigh_output(neigh, skb, is_v6gw);
+ dev_xmit_recursion_dec();
+ rcu_read_unlock_bh();
+ return ret;
+ }
+ rcu_read_unlock_bh();
+out_drop:
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ const struct iphdr *ip4h = ip_hdr(skb);
+ struct net *net = dev_net(dev);
+ int err, ret = NET_XMIT_DROP;
+
+ if (!nh) {
+ struct flowi4 fl4 = {
+ .flowi4_flags = FLOWI_FLAG_ANYSRC,
+ .flowi4_mark = skb->mark,
+ .flowi4_tos = RT_TOS(ip4h->tos),
+ .flowi4_oif = dev->ifindex,
+ .flowi4_proto = ip4h->protocol,
+ .daddr = ip4h->daddr,
+ .saddr = ip4h->saddr,
+ };
+ struct rtable *rt;
+
+ rt = ip_route_output_flow(net, &fl4, NULL);
+ if (IS_ERR(rt))
+ goto out_drop;
+ if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
+ ip_rt_put(rt);
+ goto out_drop;
+ }
+
+ skb_dst_set(skb, &rt->dst);
+ }
+
+ err = bpf_out_neigh_v4(net, skb, dev, nh);
+ if (unlikely(net_xmit_eval(err)))
+ dev->stats.tx_errors++;
+ else
+ ret = NET_XMIT_SUCCESS;
+ goto out_xmit;
+out_drop:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+out_xmit:
+ return ret;
+}
+#else
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+#endif /* CONFIG_INET */
+
+static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ struct ethhdr *ethh = eth_hdr(skb);
+
+ if (unlikely(skb->mac_header >= skb->network_header))
+ goto out;
+ bpf_push_mac_rcsum(skb);
+ if (is_multicast_ether_addr(ethh->h_dest))
+ goto out;
+
+ skb_pull(skb, sizeof(*ethh));
+ skb_unset_mac_header(skb);
+ skb_reset_network_header(skb);
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return __bpf_redirect_neigh_v4(skb, dev, nh);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return __bpf_redirect_neigh_v6(skb, dev, nh);
+out:
+ kfree_skb(skb);
+ return -ENOTSUPP;
+}
+
+/* Internal, non-exposed redirect flags. */
+enum {
+ BPF_F_NEIGH = (1ULL << 1),
+ BPF_F_PEER = (1ULL << 2),
+ BPF_F_NEXTHOP = (1ULL << 3),
+#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
+};
+
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
struct net_device *dev;
struct sk_buff *clone;
int ret;
- if (unlikely(flags & ~(BPF_F_INGRESS)))
+ if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return -EINVAL;
dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
@@ -2203,11 +2460,46 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
+int skb_do_redirect(struct sk_buff *skb)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct net *net = dev_net(skb->dev);
+ struct net_device *dev;
+ u32 flags = ri->flags;
+
+ dev = dev_get_by_index_rcu(net, ri->tgt_index);
+ ri->tgt_index = 0;
+ ri->flags = 0;
+ if (unlikely(!dev))
+ goto out_drop;
+ if (flags & BPF_F_PEER) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (unlikely(!ops->ndo_get_peer_dev ||
+ !skb_at_tc_ingress(skb)))
+ goto out_drop;
+ dev = ops->ndo_get_peer_dev(dev);
+ if (unlikely(!dev ||
+ !is_skb_forwardable(dev, skb) ||
+ net_eq(net, dev_net(dev))))
+ goto out_drop;
+ skb->dev = dev;
+ return -EAGAIN;
+ }
+ return flags & BPF_F_NEIGH ?
+ __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
+ &ri->nh : NULL) :
+ __bpf_redirect(skb, dev, flags);
+out_drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- if (unlikely(flags & ~(BPF_F_INGRESS)))
+ if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return TC_ACT_SHOT;
ri->flags = flags;
@@ -2216,29 +2508,63 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
return TC_ACT_REDIRECT;
}
-int skb_do_redirect(struct sk_buff *skb)
+static const struct bpf_func_proto bpf_redirect_proto = {
+ .func = bpf_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct net_device *dev;
- dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
- ri->tgt_index = 0;
- if (unlikely(!dev)) {
- kfree_skb(skb);
- return -EINVAL;
- }
+ if (unlikely(flags))
+ return TC_ACT_SHOT;
+
+ ri->flags = BPF_F_PEER;
+ ri->tgt_index = ifindex;
- return __bpf_redirect(skb, dev, ri->flags);
+ return TC_ACT_REDIRECT;
}
-static const struct bpf_func_proto bpf_redirect_proto = {
- .func = bpf_redirect,
+static const struct bpf_func_proto bpf_redirect_peer_proto = {
+ .func = bpf_redirect_peer,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
+ int, plen, u64, flags)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+ if (unlikely((plen && plen < sizeof(*params)) || flags))
+ return TC_ACT_SHOT;
+
+ ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
+ ri->tgt_index = ifindex;
+
+ BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
+ if (plen)
+ memcpy(&ri->nh, params, sizeof(ri->nh));
+
+ return TC_ACT_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_neigh_proto = {
+ .func = bpf_redirect_neigh,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
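Taken together, the internal BPF_F_NEIGH/BPF_F_PEER/BPF_F_NEXTHOP plumbing backs two new tc-BPF helpers. A hedged usage sketch from the program side, compiled against the usual libbpf headers; TARGET_IFINDEX and PEER_IFINDEX are placeholders, AF_INET and the helper declarations come from the uapi/libbpf headers, and the section name depends on the loader:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_endian.h>

	SEC("classifier")
	int redirect_example(struct __sk_buff *skb)
	{
		struct bpf_redir_neigh nh = {
			.nh_family = AF_INET,
			.ipv4_nh   = bpf_htonl(0x0a000001),	/* 10.0.0.1 */
		};

		/* Cheap netns crossing into a veth-style peer in another
		 * namespace (tc ingress only, device must provide
		 * ndo_get_peer_dev):
		 *
		 *	return bpf_redirect_peer(PEER_IFINDEX, 0);
		 */

		/* Hand the packet to the neighbour layer on the target device,
		 * resolving the supplied nexthop instead of doing a FIB lookup
		 * (pass plen of 0 to fall back to the FIB lookup path).
		 */
		return bpf_redirect_neigh(TARGET_IFINDEX, &nh, sizeof(nh), 0);
	}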
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
msg->apply_bytes = bytes;
@@ -2704,6 +3030,23 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
};
+
+BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ return sock_cgroup_classid(&sk->sk_cgrp_data);
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
+ .func = bpf_skb_cgroup_classid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
#endif
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
@@ -3215,6 +3558,48 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb)
SKB_MAX_ALLOC;
}
+BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+ u32, mode, u64, flags)
+{
+ u32 len_diff_abs = abs(len_diff);
+ bool shrink = len_diff < 0;
+ int ret = 0;
+
+ if (unlikely(flags || mode))
+ return -EINVAL;
+ if (unlikely(len_diff_abs > 0xfffU))
+ return -EFAULT;
+
+ if (!shrink) {
+ ret = skb_cow(skb, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+ __skb_push(skb, len_diff_abs);
+ memset(skb->data, 0, len_diff_abs);
+ } else {
+ if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
+ return -ENOMEM;
+ __skb_pull(skb, len_diff_abs);
+ }
+ bpf_compute_data_end_sk_skb(skb);
+ if (tls_sw_has_ctx_rx(skb->sk)) {
+ struct strp_msg *rxm = strp_msg(skb);
+
+ rxm->full_len += len_diff;
+ }
+ return ret;
+}
+
+static const struct bpf_func_proto sk_skb_adjust_room_proto = {
+ .func = sk_skb_adjust_room,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
@@ -3803,19 +4188,18 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
-BTF_ID_LIST(bpf_skb_output_btf_ids)
-BTF_ID(struct, sk_buff)
+BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)
const struct bpf_func_proto bpf_skb_output_proto = {
.func = bpf_skb_event_output,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_skb_output_btf_ids[0],
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
- .btf_id = bpf_skb_output_btf_ids,
};
static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -4086,18 +4470,17 @@ static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
struct cgroup *cgrp;
+ sk = sk_to_full_sk(sk);
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
return cgroup_id(cgrp);
}
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{
- struct sock *sk = skb_to_full_sk(skb);
-
- if (!sk || !sk_fullsock(sk))
- return 0;
-
- return __bpf_sk_cgroup_id(sk);
+ return __bpf_sk_cgroup_id(skb->sk);
}
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
@@ -4113,6 +4496,10 @@ static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
struct cgroup *ancestor;
struct cgroup *cgrp;
+ sk = sk_to_full_sk(sk);
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
ancestor = cgroup_ancestor(cgrp, ancestor_level);
if (!ancestor)
@@ -4124,12 +4511,7 @@ static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
ancestor_level)
{
- struct sock *sk = skb_to_full_sk(skb);
-
- if (!sk || !sk_fullsock(sk))
- return 0;
-
- return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+ return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level);
}
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
@@ -4149,7 +4531,7 @@ static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
.func = bpf_sk_cgroup_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_SOCKET,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
};
BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
@@ -4161,7 +4543,7 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
.func = bpf_sk_ancestor_cgroup_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_SOCKET,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.arg2_type = ARG_ANYTHING,
};
#endif
@@ -4199,24 +4581,23 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
-BTF_ID_LIST(bpf_xdp_output_btf_ids)
-BTF_ID(struct, xdp_buff)
+BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)
const struct bpf_func_proto bpf_xdp_output_proto = {
.func = bpf_xdp_event_output,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
- .btf_id = bpf_xdp_output_btf_ids,
};
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
- return skb->sk ? sock_gen_cookie(skb->sk) : 0;
+ return skb->sk ? __sock_gen_cookie(skb->sk) : 0;
}
static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
@@ -4228,7 +4609,7 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
- return sock_gen_cookie(ctx->sk);
+ return __sock_gen_cookie(ctx->sk);
}
static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
@@ -4240,7 +4621,7 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
{
- return sock_gen_cookie(ctx);
+ return __sock_gen_cookie(ctx);
}
static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
@@ -4252,7 +4633,7 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
- return sock_gen_cookie(ctx->sk);
+ return __sock_gen_cookie(ctx->sk);
}
static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
@@ -4265,7 +4646,7 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
static u64 __bpf_get_netns_cookie(struct sock *sk)
{
#ifdef CONFIG_NET_NS
- return net_gen_cookie(sk ? sk->sk_net.net : &init_net);
+ return __net_gen_cookie(sk ? sk->sk_net.net : &init_net);
#else
return 0;
#endif
@@ -4313,10 +4694,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-#define SOCKOPT_CC_REINIT (1 << 0)
-
static int _bpf_setsockopt(struct sock *sk, int level, int optname,
- char *optval, int optlen, u32 flags)
+ char *optval, int optlen)
{
char devname[IFNAMSIZ];
int val, valbool;
@@ -4354,7 +4733,8 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED);
- sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
+ sk->sk_max_pacing_rate = (val == ~0U) ?
+ ~0UL : (unsigned int)val;
sk->sk_pacing_rate = min(sk->sk_pacing_rate,
sk->sk_max_pacing_rate);
break;
@@ -4449,16 +4829,15 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
sk->sk_prot->setsockopt == tcp_setsockopt) {
if (optname == TCP_CONGESTION) {
char name[TCP_CA_NAME_MAX];
- bool reinit = flags & SOCKOPT_CC_REINIT;
strncpy(name, optval, min_t(long, optlen,
TCP_CA_NAME_MAX-1));
name[TCP_CA_NAME_MAX-1] = 0;
- ret = tcp_set_congestion_control(sk, name, false,
- reinit, true);
+ ret = tcp_set_congestion_control(sk, name, false, true);
} else {
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
if (optlen != sizeof(int))
return -EINVAL;
@@ -4480,6 +4859,20 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
tp->snd_ssthresh = val;
}
break;
+ case TCP_BPF_DELACK_MAX:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_DELACK_MAX ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_delack_max = timeout;
+ break;
+ case TCP_BPF_RTO_MIN:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_RTO_MIN ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_rto_min = timeout;
+ break;
case TCP_SAVE_SYN:
if (val < 0 || val > 1)
ret = -EINVAL;
@@ -4513,6 +4906,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
else
icsk->icsk_user_timeout = val;
break;
+ case TCP_NOTSENT_LOWAT:
+ tp->notsent_lowat = val;
+ sk->sk_write_space(sk);
+ break;
+ case TCP_WINDOW_CLAMP:
+ ret = tcp_set_window_clamp(sk, val);
+ break;
default:
ret = -EINVAL;
}
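The new TCP_BPF_DELACK_MAX and TCP_BPF_RTO_MIN knobs (plus the TCP_NOTSENT_LOWAT and TCP_WINDOW_CLAMP passthrough) are only reachable through bpf_setsockopt() from a sockops program. A hedged sketch of how they might be used, e.g. to tighten timers for an intra-datacenter service; constants come from the uapi headers and the values are illustrative:

	SEC("sockops")
	int tune_timers(struct bpf_sock_ops *skops)
	{
		int rto_min_us = 5000;		/* 5 ms, converted to jiffies above */
		int delack_max_us = 10000;	/* 10 ms */

		if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
		    skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) {
			bpf_setsockopt(skops, SOL_TCP, TCP_BPF_RTO_MIN,
				       &rto_min_us, sizeof(rto_min_us));
			bpf_setsockopt(skops, SOL_TCP, TCP_BPF_DELACK_MAX,
				       &delack_max_us, sizeof(delack_max_us));
		}
		return 1;
	}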
@@ -4550,9 +4950,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
tp = tcp_sk(sk);
if (optlen <= 0 || !tp->saved_syn ||
- optlen > tp->saved_syn[0])
+ optlen > tcp_saved_syn_len(tp->saved_syn))
goto err_clear;
- memcpy(optval, tp->saved_syn + 1, optlen);
+ memcpy(optval, tp->saved_syn->data, optlen);
break;
default:
goto err_clear;
@@ -4600,9 +5000,7 @@ err_clear:
BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
int, level, int, optname, char *, optval, int, optlen)
{
- u32 flags = 0;
- return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen,
- flags);
+ return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen);
}
static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
@@ -4636,11 +5034,7 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
- u32 flags = 0;
- if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
- flags |= SOCKOPT_CC_REINIT;
- return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen,
- flags);
+ return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
@@ -4654,9 +5048,99 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
+ int optname, const u8 **start)
+{
+ struct sk_buff *syn_skb = bpf_sock->syn_skb;
+ const u8 *hdr_start;
+ int ret;
+
+ if (syn_skb) {
+ /* sk is a request_sock here */
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = syn_skb->data;
+ ret = tcp_hdrlen(syn_skb);
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = skb_network_header(syn_skb);
+ ret = skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+ hdr_start = skb_mac_header(syn_skb);
+ ret = skb_mac_header_len(syn_skb) +
+ skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
+ }
+ } else {
+ struct sock *sk = bpf_sock->sk;
+ struct saved_syn *saved_syn;
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ /* synack retransmit. bpf_sock->syn_skb will
+ * not be available. It has to resort to
+ * saved_syn (if it is saved).
+ */
+ saved_syn = inet_reqsk(sk)->saved_syn;
+ else
+ saved_syn = tcp_sk(sk)->saved_syn;
+
+ if (!saved_syn)
+ return -ENOENT;
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen;
+ ret = saved_syn->tcp_hdrlen;
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen;
+ ret = saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+
+ /* TCP_SAVE_SYN may not have saved the mac hdr */
+ if (!saved_syn->mac_hdrlen)
+ return -ENOENT;
+
+ hdr_start = saved_syn->data;
+ ret = saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ }
+ }
+
+ *start = hdr_start;
+ return ret;
+}
+
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
+ if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
+ optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
+ int ret, copy_len = 0;
+ const u8 *start;
+
+ ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
+ if (ret > 0) {
+ copy_len = ret;
+ if (optlen < copy_len) {
+ copy_len = optlen;
+ ret = -ENOSPC;
+ }
+
+ memcpy(optval, start, copy_len);
+ }
+
+ /* Zero out unused buffer at the end */
+ memset(optval + copy_len, 0, optlen - copy_len);
+
+ return ret;
+ }
+
return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
@@ -4794,7 +5278,6 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
memcpy(params->smac, dev->dev_addr, ETH_ALEN);
params->h_vlan_TCI = 0;
params->h_vlan_proto = 0;
- params->ifindex = dev->ifindex;
return 0;
}
@@ -4891,6 +5374,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
dev = nhc->nhc_dev;
params->rt_metric = res.fi->fib_priority;
+ params->ifindex = dev->ifindex;
/* xdp and cls_bpf programs are run in RCU-bh so
* rcu_read_lock_bh is not needed here
@@ -5016,6 +5500,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
dev = res.nh->fib_nh_dev;
params->rt_metric = res.f6i->fib6_metric;
+ params->ifindex = dev->ifindex;
/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
* not needed here.
@@ -5601,7 +6086,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- if (sk_is_refcounted(sk))
+ if (sk && sk_is_refcounted(sk))
sock_gen_put(sk);
return 0;
}
@@ -5610,7 +6095,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = {
.func = bpf_sk_release,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
};
BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
@@ -5992,7 +6477,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
u32 cookie;
int ret;
- if (unlikely(th_len < sizeof(*th)))
+ if (unlikely(!sk || th_len < sizeof(*th)))
return -EINVAL;
/* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
@@ -6045,7 +6530,7 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
.gpl_only = true,
.pkt_access = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_PTR_TO_MEM,
@@ -6059,7 +6544,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
u32 cookie;
u16 mss;
- if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+ if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
return -EINVAL;
if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
@@ -6114,7 +6599,7 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
.gpl_only = true, /* __cookie_v*_init_sequence() is GPL */
.pkt_access = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_PTR_TO_MEM,
@@ -6123,7 +6608,7 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
{
- if (flags != 0)
+ if (!sk || flags != 0)
return -EINVAL;
if (!skb_at_tc_ingress(skb))
return -EOPNOTSUPP;
@@ -6147,7 +6632,233 @@ static const struct bpf_func_proto bpf_sk_assign_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
+ u8 search_kind, const u8 *magic,
+ u8 magic_len, bool *eol)
+{
+ u8 kind, kind_len;
+
+ *eol = false;
+
+ while (op < opend) {
+ kind = op[0];
+
+ if (kind == TCPOPT_EOL) {
+ *eol = true;
+ return ERR_PTR(-ENOMSG);
+ } else if (kind == TCPOPT_NOP) {
+ op++;
+ continue;
+ }
+
+ if (opend - op < 2 || opend - op < op[1] || op[1] < 2)
+ /* Something is wrong in the received header.
+ * Follow the TCP stack's tcp_parse_options()
+ * and just bail here.
+ */
+ return ERR_PTR(-EFAULT);
+
+ kind_len = op[1];
+ if (search_kind == kind) {
+ if (!magic_len)
+ return op;
+
+ if (magic_len > kind_len - 2)
+ return ERR_PTR(-ENOMSG);
+
+ if (!memcmp(&op[2], magic, magic_len))
+ return op;
+ }
+
+ op += kind_len;
+ }
+
+ return ERR_PTR(-ENOMSG);
+}
+
+BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ void *, search_res, u32, len, u64, flags)
+{
+ bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
+ const u8 *op, *opend, *magic, *search = search_res;
+ u8 search_kind, search_len, copy_len, magic_len;
+ int ret;
+
+	/* 2 bytes is the minimal option length, except for TCPOPT_NOP and
+	 * TCPOPT_EOL, which are useless for the bpf prog to learn
+	 * and which this helper also disallows loading.
+	 */
+ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
+ return -EINVAL;
+
+ search_kind = search[0];
+ search_len = search[1];
+
+ if (search_len > len || search_kind == TCPOPT_NOP ||
+ search_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (search_kind == TCPOPT_EXP || search_kind == 253) {
+ /* 16 or 32 bit magic. +2 for kind and kind length */
+ if (search_len != 4 && search_len != 6)
+ return -EINVAL;
+ magic = &search[2];
+ magic_len = search_len - 2;
+ } else {
+ if (search_len)
+ return -EINVAL;
+ magic = NULL;
+ magic_len = 0;
+ }
+
+ if (load_syn) {
+ ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
+ if (ret < 0)
+ return ret;
+
+ opend = op + ret;
+ op += sizeof(struct tcphdr);
+ } else {
+ if (!bpf_sock->skb ||
+ bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ /* This bpf_sock->op cannot call this helper */
+ return -EPERM;
+
+ opend = bpf_sock->skb_data_end;
+ op = bpf_sock->skb->data + sizeof(struct tcphdr);
+ }
+
+ op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
+ &eol);
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+
+ copy_len = op[1];
+ ret = copy_len;
+ if (copy_len > len) {
+ ret = -ENOSPC;
+ copy_len = len;
+ }
+
+ memcpy(search_res, op, copy_len);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
+ .func = bpf_sock_ops_load_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
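+
A minimal sketch (not part of the patch) of calling the new load_hdr_opt helper from a sockops program to look up an experimental TCP option. The 0xeB9F magic, the struct layout and the callback are assumptions, and the program is expected to have enabled the relevant *_HDR_OPT_CB_FLAG callback flags beforehand:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("sockops")
int find_exp_opt(struct bpf_sock_ops *skops)
{
	/* kind 254 (experimental) + len + 2-byte ExID magic, per RFC 6994;
	 * on success the whole option found in the skb is copied back
	 * into this buffer and its length is returned.
	 */
	struct {
		__u8  kind;
		__u8  len;
		__u16 magic;
		__u8  data[4];
	} __attribute__((packed)) opt = {
		.kind  = 254,
		.len   = 4,		/* kind + len + 2-byte magic */
		.magic = bpf_htons(0xeB9F),
	};
	int ret;

	if (skops->op != BPF_SOCK_OPS_PARSE_HDR_OPT_CB)
		return 1;

	/* Pass BPF_LOAD_HDR_OPT_TCP_SYN in flags to search the saved SYN
	 * instead of the skb currently being parsed.
	 */
	ret = bpf_load_hdr_opt(skops, &opt, sizeof(opt), 0);
	if (ret < 0)
		return 1;	/* not found, or malformed options */

	return 1;
}

char _license[] SEC("license") = "GPL";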
+
+BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ const void *, from, u32, len, u64, flags)
+{
+ u8 new_kind, new_kind_len, magic_len = 0, *opend;
+ const u8 *op, *new_op, *magic = NULL;
+ struct sk_buff *skb;
+ bool eol;
+
+ if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
+ return -EPERM;
+
+ if (len < 2 || flags)
+ return -EINVAL;
+
+ new_op = from;
+ new_kind = new_op[0];
+ new_kind_len = new_op[1];
+
+ if (new_kind_len > len || new_kind == TCPOPT_NOP ||
+ new_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (new_kind_len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ /* 253 is another experimental kind */
+ if (new_kind == TCPOPT_EXP || new_kind == 253) {
+ if (new_kind_len < 4)
+ return -EINVAL;
+		/* Match on the 2 byte magic also.
+		 * RFC 6994: the magic could be 2 or 4 bytes.
+		 * Hence, matching by 2 bytes only is on the
+		 * conservative side, but it is the right
+		 * thing to do for the 'search-for-duplication'
+		 * purpose.
+		 */
+ magic = &new_op[2];
+ magic_len = 2;
+ }
+
+ /* Check for duplication */
+ skb = bpf_sock->skb;
+ op = skb->data + sizeof(struct tcphdr);
+ opend = bpf_sock->skb_data_end;
+
+ op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
+ &eol);
+ if (!IS_ERR(op))
+ return -EEXIST;
+
+ if (PTR_ERR(op) != -ENOMSG)
+ return PTR_ERR(op);
+
+ if (eol)
+		/* The option list has ended. Treat it as if no more
+		 * header options can be written.
+		 */
+ return -ENOSPC;
+
+ /* No duplication found. Store the header option. */
+ memcpy(opend, from, new_kind_len);
+
+ bpf_sock->remaining_opt_len -= new_kind_len;
+ bpf_sock->skb_data_end += new_kind_len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
+ .func = bpf_sock_ops_store_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ u32, len, u64, flags)
+{
+ if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ return -EPERM;
+
+ if (flags || len < 2)
+ return -EINVAL;
+
+ if (len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ bpf_sock->remaining_opt_len -= len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
+ .func = bpf_sock_ops_reserve_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
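
Reserving and then writing a header option from BPF follows a two-callback pattern, sketched below (not part of the patch). The option kind and magic are assumptions, and BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG must have been enabled on the socket for these callbacks to fire:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("sockops")
int write_exp_opt(struct bpf_sock_ops *skops)
{
	/* kind 254, total length 6: kind + len + 2-byte magic + 2 bytes data */
	struct {
		__u8  kind;
		__u8  len;
		__u16 magic;
		__u16 data;
	} __attribute__((packed)) opt = {
		.kind  = 254,
		.len   = 6,
		.magic = bpf_htons(0xeB9F),
		.data  = bpf_htons(1),
	};

	switch (skops->op) {
	case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
		/* First pass: only reserve room in the TCP header */
		bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
		break;
	case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
		/* Second pass: write it; -EEXIST means it is already there */
		bpf_store_hdr_opt(skops, &opt, sizeof(opt), 0);
		break;
	}

	return 1;
}

char _license[] SEC("license") = "GPL";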
@@ -6164,6 +6875,7 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_skb_change_tail ||
func == sk_skb_change_tail ||
func == bpf_skb_adjust_room ||
+ func == sk_skb_adjust_room ||
func == bpf_skb_pull_data ||
func == sk_skb_pull_data ||
func == bpf_clone_redirect ||
@@ -6180,6 +6892,9 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_lwt_seg6_adjust_srh ||
func == bpf_lwt_seg6_action ||
#endif
+#ifdef CONFIG_INET
+ func == bpf_sock_ops_store_hdr_opt ||
+#endif
func == bpf_lwt_in_push_encap ||
func == bpf_lwt_xmit_push_encap)
return true;
@@ -6283,6 +6998,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_delete_proto;
case BPF_FUNC_setsockopt:
switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
return &bpf_sock_addr_setsockopt_proto;
@@ -6291,6 +7008,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
case BPF_FUNC_getsockopt:
switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
return &bpf_sock_addr_getsockopt_proto;
@@ -6298,7 +7017,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return NULL;
}
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6317,7 +7036,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6419,6 +7138,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return bpf_get_skb_set_tunnel_proto(func_id);
case BPF_FUNC_redirect:
return &bpf_redirect_proto;
+ case BPF_FUNC_redirect_neigh:
+ return &bpf_redirect_neigh_proto;
+ case BPF_FUNC_redirect_peer:
+ return &bpf_redirect_peer_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
@@ -6449,6 +7172,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_get_xfrm_state:
return &bpf_skb_get_xfrm_state_proto;
#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_skb_cgroup_classid:
+ return &bpf_skb_cgroup_classid_proto;
+#endif
#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto;
@@ -6478,7 +7205,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_assign_proto;
#endif
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6519,7 +7246,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_tcp_gen_syncookie_proto;
#endif
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6551,11 +7278,17 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_INET
+ case BPF_FUNC_load_hdr_opt:
+ return &bpf_sock_ops_load_hdr_opt_proto;
+ case BPF_FUNC_store_hdr_opt:
+ return &bpf_sock_ops_store_hdr_opt_proto;
+ case BPF_FUNC_reserve_hdr_opt:
+ return &bpf_sock_ops_reserve_hdr_opt_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif /* CONFIG_INET */
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6601,7 +7334,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_cgroup_classid_curr_proto;
#endif
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6622,6 +7355,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &sk_skb_change_tail_proto;
case BPF_FUNC_skb_change_head:
return &sk_skb_change_head_proto;
+ case BPF_FUNC_skb_adjust_room:
+ return &sk_skb_adjust_room_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
@@ -6643,7 +7378,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skc_lookup_tcp_proto;
#endif
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6654,7 +7389,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_load_bytes:
return &bpf_flow_dissector_load_bytes_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6681,7 +7416,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -7350,6 +8085,20 @@ static bool sock_ops_is_valid_access(int off, int size,
return false;
info->reg_type = PTR_TO_SOCKET_OR_NULL;
break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size,
+ size_default);
default:
if (size != size_default)
return false;
@@ -8451,17 +9200,22 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
switch (si->off) {
- case offsetof(struct bpf_sock_ops, op) ...
+ case offsetof(struct bpf_sock_ops, op):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ op),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, op));
+ break;
+
+ case offsetof(struct bpf_sock_ops, replylong[0]) ...
offsetof(struct bpf_sock_ops, replylong[3]):
- BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, op) !=
- sizeof_field(struct bpf_sock_ops_kern, op));
BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
sizeof_field(struct bpf_sock_ops_kern, reply));
BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
sizeof_field(struct bpf_sock_ops_kern, replylong));
off = si->off;
- off -= offsetof(struct bpf_sock_ops, op);
- off += offsetof(struct bpf_sock_ops_kern, op);
+ off -= offsetof(struct bpf_sock_ops, replylong[0]);
+ off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
off);
@@ -8682,6 +9436,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sock_ops, sk):
SOCK_OPS_GET_SK();
break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb_data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb_data_end));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, data));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_len):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, len));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ off = offsetof(struct sk_buff, cb);
+ off += offsetof(struct tcp_skb_cb, tcp_flags);
+ *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
+ tcp_flags),
+ si->dst_reg, si->dst_reg, off);
+ break;
}
return insn - insn_buf;
}
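
The new skb_data/skb_data_end/skb_tcp_flags context fields wired up above allow direct packet access from sockops programs. A minimal sketch (not part of the patch), with TCPHDR_SYN defined locally since the flag constant is not exported to BPF here, and the callback check being an assumption:

#include <linux/bpf.h>
#include <linux/tcp.h>
#include <bpf/bpf_helpers.h>

#define TCPHDR_SYN 0x02		/* TCP flag bit, defined locally for this sketch */

SEC("sockops")
int peek_tcp_flags(struct bpf_sock_ops *skops)
{
	const struct tcphdr *th = (const struct tcphdr *)(long)skops->skb_data;
	const void *data_end = (const void *)(long)skops->skb_data_end;

	if (skops->op != BPF_SOCK_OPS_PARSE_HDR_OPT_CB)
		return 1;

	/* Bounds check before dereferencing the TCP header directly */
	if ((const void *)(th + 1) > data_end)
		return 1;

	if (skops->skb_tcp_flags & TCPHDR_SYN) {
		/* the skb being parsed carries a SYN */
	}

	return 1;
}

char _license[] SEC("license") = "GPL";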
@@ -9356,7 +10153,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -9506,17 +10303,6 @@ BTF_SOCK_TYPE_xxx
u32 btf_sock_ids[MAX_BTF_SOCK_TYPE];
#endif
-static bool check_arg_btf_id(u32 btf_id, u32 arg)
-{
- int i;
-
- /* only one argument, no need to check arg */
- for (i = 0; i < MAX_BTF_SOCK_TYPE; i++)
- if (btf_sock_ids[i] == btf_id)
- return true;
- return false;
-}
-
BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
{
/* tcp6_sock type is not generated in dwarf and hence btf,
@@ -9534,8 +10320,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
.func = bpf_skc_to_tcp6_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
- .arg1_type = ARG_PTR_TO_BTF_ID,
- .check_btf_id = check_arg_btf_id,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
};
@@ -9551,8 +10336,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
.func = bpf_skc_to_tcp_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
- .arg1_type = ARG_PTR_TO_BTF_ID,
- .check_btf_id = check_arg_btf_id,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
};
@@ -9581,8 +10365,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
.func = bpf_skc_to_tcp_timewait_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
- .arg1_type = ARG_PTR_TO_BTF_ID,
- .check_btf_id = check_arg_btf_id,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
};
@@ -9605,8 +10388,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
.func = bpf_skc_to_tcp_request_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
- .arg1_type = ARG_PTR_TO_BTF_ID,
- .check_btf_id = check_arg_btf_id,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
};
@@ -9627,7 +10409,55 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
.func = bpf_skc_to_udp6_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
- .arg1_type = ARG_PTR_TO_BTF_ID,
- .check_btf_id = check_arg_btf_id,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
};
+
+BPF_CALL_1(bpf_sock_from_file, struct file *, file)
+{
+ return (unsigned long)sock_from_file(file);
+}
+
+BTF_ID_LIST(bpf_sock_from_file_btf_ids)
+BTF_ID(struct, socket)
+BTF_ID(struct, file)
+
+const struct bpf_func_proto bpf_sock_from_file_proto = {
+ .func = bpf_sock_from_file,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .ret_btf_id = &bpf_sock_from_file_btf_ids[0],
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_sock_from_file_btf_ids[1],
+};
+
+static const struct bpf_func_proto *
+bpf_sk_base_func_proto(enum bpf_func_id func_id)
+{
+ const struct bpf_func_proto *func;
+
+ switch (func_id) {
+ case BPF_FUNC_skc_to_tcp6_sock:
+ func = &bpf_skc_to_tcp6_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_sock:
+ func = &bpf_skc_to_tcp_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_timewait_sock:
+ func = &bpf_skc_to_tcp_timewait_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_request_sock:
+ func = &bpf_skc_to_tcp_request_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_udp6_sock:
+ func = &bpf_skc_to_udp6_sock_proto;
+ break;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+
+ if (!perfmon_capable())
+ return NULL;
+
+ return func;
+}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 29806eb765cf..6f1adba6695f 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -48,7 +48,7 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
memset(flow_dissector, 0, sizeof(*flow_dissector));
for (i = 0; i < key_count; i++, key++) {
- /* User should make sure that every key target offset is withing
+ /* User should make sure that every key target offset is within
* boundaries of unsigned short.
*/
BUG_ON(key->offset > USHRT_MAX);
@@ -932,8 +932,14 @@ bool __skb_flow_dissect(const struct net *net,
int offset = 0;
ops = skb->dev->dsa_ptr->tag_ops;
- if (ops->flow_dissect &&
- !ops->flow_dissect(skb, &proto, &offset)) {
+ /* Tail taggers don't break flow dissection */
+ if (!ops->tail_tag) {
+ if (ops->flow_dissect)
+ ops->flow_dissect(skb, &proto, &offset);
+ else
+ dsa_tag_generic_flow_dissect(skb,
+ &proto,
+ &offset);
hlen -= offset;
nhoff += offset;
}
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index d4474c812b64..715b67f6c62f 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -381,10 +381,8 @@ static void __flow_block_indr_cleanup(void (*release)(void *cb_priv),
list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) {
if (this->release == release &&
- this->indr.cb_priv == cb_priv) {
+ this->indr.cb_priv == cb_priv)
list_move(&this->indr.list, cleanup_list);
- return;
- }
}
}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 80dbf2f4016e..8e582e29a41e 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -80,11 +80,11 @@ static void est_timer(struct timer_list *t)
u64 rate, brate;
est_fetch_counters(est, &b);
- brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log);
- brate -= (est->avbps >> est->ewma_log);
+ brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log);
+ brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log);
- rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log);
- rate -= (est->avpps >> est->ewma_log);
+ rate = (b.packets - est->last_packets) << (10 - est->intvl_log);
+ rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
write_seqcount_begin(&est->seq);
est->avbps += brate;
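
Reading these two hunks together: the per-interval delta is now scaled first, scaled = (b.bytes - last_bytes) << (10 - intvl_log), and the EWMA step is applied afterwards, avbps += (scaled >> ewma_log) - (avbps >> ewma_log). Splitting the shift this way, rather than shifting by (10 - ewma_log - intvl_log) in one expression, appears to be what keeps the shift count non-negative for large ewma_log values, which the parameter check added in the next hunk also enforces.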
@@ -143,6 +143,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
+ if (parm->ewma_log == 0 || parm->ewma_log >= 31)
+ return -EINVAL;
+
est = kzalloc(sizeof(*est), GFP_KERNEL);
if (!est)
return -ENOBUFS;
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index e095fb871d91..6eb2e5ec2c50 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -99,9 +99,14 @@ void gro_cells_destroy(struct gro_cells *gcells)
struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
napi_disable(&cell->napi);
- netif_napi_del(&cell->napi);
+ __netif_napi_del(&cell->napi);
__skb_queue_purge(&cell->napi_skbs);
}
+ /* This barrier is needed because netpoll could access dev->napi_list
+ * under rcu protection.
+ */
+ synchronize_net();
+
free_percpu(gcells->cells);
gcells->cells = NULL;
}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 7d3438215f32..2f7940bcf715 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -39,12 +39,11 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
{
int ret;
- /* Preempt disable is needed to protect per-cpu redirect_info between
- * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
- * access to maps strictly require a rcu_read_lock() for protection,
- * mixing with BH RCU lock doesn't work.
+ /* Migration disable and BH disable are needed to protect per-cpu
+ * redirect_info between BPF prog and skb_do_redirect().
*/
- preempt_disable();
+ migrate_disable();
+ local_bh_disable();
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
@@ -78,7 +77,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
break;
}
- preempt_enable();
+ local_bh_enable();
+ migrate_enable();
return ret;
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8e39e28b0a8d..6d2d557442dc 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -235,6 +235,8 @@ static int neigh_forced_gc(struct neigh_table *tbl)
write_lock(&n->lock);
if ((n->nud_state == NUD_FAILED) ||
+ (tbl->is_multicast &&
+ tbl->is_multicast(n->primary_key)) ||
time_after(tref, n->updated))
remove = true;
write_unlock(&n->lock);
@@ -1243,13 +1245,14 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
old = neigh->nud_state;
err = -EPERM;
- if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
- (old & (NUD_NOARP | NUD_PERMANENT)))
- goto out;
if (neigh->dead) {
NL_SET_ERR_MSG(extack, "Neighbor entry is now dead");
+ new = old;
goto out;
}
+ if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
+ (old & (NUD_NOARP | NUD_PERMANENT)))
+ goto out;
ext_learn_change = neigh_update_ext_learned(neigh, flags, &notify);
@@ -1567,10 +1570,8 @@ static void neigh_proxy_process(struct timer_list *t)
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb)
{
- unsigned long now = jiffies;
-
- unsigned long sched_next = now + (prandom_u32() %
- NEIGH_VAR(p, PROXY_DELAY));
+ unsigned long sched_next = jiffies +
+ prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY));
if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
kfree_skb(skb);
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 6bbd06f7dc7d..c714e6a9dad4 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -116,6 +116,12 @@ static int dev_seq_show(struct seq_file *seq, void *v)
return 0;
}
+static u32 softnet_backlog_len(struct softnet_data *sd)
+{
+ return skb_queue_len_lockless(&sd->input_pkt_queue) +
+ skb_queue_len_lockless(&sd->process_queue);
+}
+
static struct softnet_data *softnet_get_online(loff_t *pos)
{
struct softnet_data *sd = NULL;
@@ -159,12 +165,17 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
rcu_read_unlock();
#endif
+	/* the index is the CPU id owning this sd. Since offline CPUs are not
+	 * displayed, it would otherwise not be trivial for user-space
+	 * to map the data to a specific CPU
+ */
seq_printf(seq,
- "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
sd->processed, sd->dropped, sd->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
- sd->received_rps, flow_limit_count);
+ sd->received_rps, flow_limit_count,
+ softnet_backlog_len(sd), (int)seq->index);
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index efec66fa78b7..daf502c13d6d 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1027,7 +1027,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct kobject *kobj = &dev->_rx[i].kobj;
- if (!refcount_read(&dev_net(dev)->count))
+ if (!refcount_read(&dev_net(dev)->ns.count))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -1158,8 +1158,8 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
* belongs to the root device it will be reported with just the
* traffic class, so just "0" for TC 0 for example.
*/
- return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
- sprintf(buf, "%u\n", tc);
+ return dev->num_tc < 0 ? sprintf(buf, "%d%d\n", tc, dev->num_tc) :
+ sprintf(buf, "%d\n", tc);
}
#ifdef CONFIG_XPS
@@ -1317,8 +1317,8 @@ static const struct attribute_group dql_group = {
static ssize_t xps_cpus_show(struct netdev_queue *queue,
char *buf)
{
+ int cpu, len, ret, num_tc = 1, tc = 0;
struct net_device *dev = queue->dev;
- int cpu, len, num_tc = 1, tc = 0;
struct xps_dev_maps *dev_maps;
cpumask_var_t mask;
unsigned long index;
@@ -1328,22 +1328,31 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
index = get_netdev_queue_index(queue);
+ if (!rtnl_trylock())
+ return restart_syscall();
+
if (dev->num_tc) {
/* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
- if (num_tc < 0)
- return -EINVAL;
+ if (num_tc < 0) {
+ ret = -EINVAL;
+ goto err_rtnl_unlock;
+ }
/* If queue belongs to subordinate dev use its map */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
tc = netdev_txq_to_tc(dev, index);
- if (tc < 0)
- return -EINVAL;
+ if (tc < 0) {
+ ret = -EINVAL;
+ goto err_rtnl_unlock;
+ }
}
- if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_rtnl_unlock;
+ }
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_cpus_map);
@@ -1366,9 +1375,15 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
}
rcu_read_unlock();
+ rtnl_unlock();
+
len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
free_cpumask_var(mask);
return len < PAGE_SIZE ? len : -EINVAL;
+
+err_rtnl_unlock:
+ rtnl_unlock();
+ return ret;
}
static ssize_t xps_cpus_store(struct netdev_queue *queue,
@@ -1396,7 +1411,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
return err;
}
+ if (!rtnl_trylock()) {
+ free_cpumask_var(mask);
+ return restart_syscall();
+ }
+
err = netif_set_xps_queue(dev, mask, index);
+ rtnl_unlock();
free_cpumask_var(mask);
@@ -1408,22 +1429,29 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
{
+ int j, len, ret, num_tc = 1, tc = 0;
struct net_device *dev = queue->dev;
struct xps_dev_maps *dev_maps;
unsigned long *mask, index;
- int j, len, num_tc = 1, tc = 0;
index = get_netdev_queue_index(queue);
+ if (!rtnl_trylock())
+	/* skb linearize may fail with ENOMEM, but let's simply try again
+	 * later if this happens. Under memory pressure we don't want to
+	 * drop the skb. We need to linearize the skb so that the mapping
+	 * in skb_to_sgvec cannot error.
tc = netdev_txq_to_tc(dev, index);
- if (tc < 0)
- return -EINVAL;
+ if (tc < 0) {
+ ret = -EINVAL;
+ goto err_rtnl_unlock;
+ }
}
mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
- if (!mask)
- return -ENOMEM;
+ if (!mask) {
+ ret = -ENOMEM;
+ goto err_rtnl_unlock;
+ }
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_rxqs_map);
@@ -1449,10 +1477,16 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
out_no_maps:
rcu_read_unlock();
+ rtnl_unlock();
+
len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
bitmap_free(mask);
return len < PAGE_SIZE ? len : -EINVAL;
+
+err_rtnl_unlock:
+ rtnl_unlock();
+ return ret;
}
static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
@@ -1478,10 +1512,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
return err;
}
+ if (!rtnl_trylock()) {
+ bitmap_free(mask);
+ return restart_syscall();
+ }
+
cpus_read_lock();
err = __netif_set_xps_queue(dev, mask, index, true);
cpus_read_unlock();
+ rtnl_unlock();
+
bitmap_free(mask);
return err ? : len;
}
@@ -1605,7 +1646,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
- if (!refcount_read(&dev_net(dev)->count))
+ if (!refcount_read(&dev_net(dev)->ns.count))
queue->kobj.uevent_suppress = 1;
#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
@@ -1852,7 +1893,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &ndev->dev;
- if (!refcount_read(&dev_net(ndev)->count))
+ if (!refcount_read(&dev_net(ndev)->ns.count))
dev_set_uevent_suppress(dev, 1);
kobject_get(&dev->kobj);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 944ab214e5ae..2ef3b4557f40 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -19,6 +19,7 @@
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
#include <linux/uidgid.h>
+#include <linux/cookie.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -44,7 +45,7 @@ static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
#endif
struct net init_net = {
- .count = REFCOUNT_INIT(1),
+ .ns.count = REFCOUNT_INIT(1),
.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
#ifdef CONFIG_KEYS
.key_domain = &init_net_key_domain,
@@ -69,16 +70,16 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
-static atomic64_t cookie_gen;
+DEFINE_COOKIE(net_cookie);
-u64 net_gen_cookie(struct net *net)
+u64 __net_gen_cookie(struct net *net)
{
while (1) {
u64 res = atomic64_read(&net->net_cookie);
if (res)
return res;
- res = atomic64_inc_return(&cookie_gen);
+ res = gen_cookie_next(&net_cookie);
atomic64_cmpxchg(&net->net_cookie, 0, res);
}
}
@@ -248,7 +249,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
int id;
- if (refcount_read(&net->count) == 0)
+ if (refcount_read(&net->ns.count) == 0)
return NETNSA_NSID_NOT_ASSIGNED;
spin_lock_bh(&net->nsid_lock);
@@ -328,7 +329,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
int error = 0;
LIST_HEAD(net_exit_list);
- refcount_set(&net->count, 1);
+ refcount_set(&net->ns.count, 1);
refcount_set(&net->passive, 1);
get_random_bytes(&net->hash_mix, sizeof(u32));
net->dev_base_seq = 1;
@@ -1101,7 +1102,10 @@ static int __init net_ns_init(void)
panic("Could not allocate generic netns");
rcu_assign_pointer(init_net.gen, ng);
- net_gen_cookie(&init_net);
+
+ preempt_disable();
+ __net_gen_cookie(&init_net);
+ preempt_enable();
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 41b24cd31562..b49c57d35a88 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -68,9 +68,8 @@ struct update_classid_context {
static int update_classid_sock(const void *v, struct file *file, unsigned n)
{
- int err;
struct update_classid_context *ctx = (void *)v;
- struct socket *sock = sock_from_file(file, &err);
+ struct socket *sock = sock_from_file(file);
if (sock) {
spin_lock(&cgroup_sk_update_lock);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 2338753e936b..960948290001 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -29,6 +29,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/if_vlan.h>
+#include <net/dsa.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/addrconf.h>
@@ -297,7 +298,7 @@ static int netpoll_owner_active(struct net_device *dev)
{
struct napi_struct *napi;
- list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
if (napi->poll_owner == smp_processor_id())
return 1;
}
@@ -657,15 +658,15 @@ EXPORT_SYMBOL_GPL(__netpoll_setup);
int netpoll_setup(struct netpoll *np)
{
- struct net_device *ndev = NULL;
+ struct net_device *ndev = NULL, *dev = NULL;
+ struct net *net = current->nsproxy->net_ns;
struct in_device *in_dev;
int err;
rtnl_lock();
- if (np->dev_name[0]) {
- struct net *net = current->nsproxy->net_ns;
+ if (np->dev_name[0])
ndev = __dev_get_by_name(net, np->dev_name);
- }
+
if (!ndev) {
np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
err = -ENODEV;
@@ -673,6 +674,19 @@ int netpoll_setup(struct netpoll *np)
}
dev_hold(ndev);
+	/* bring up DSA management network devices first */
+ for_each_netdev(net, dev) {
+ if (!netdev_uses_dsa(dev))
+ continue;
+
+ err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
+ if (err < 0) {
+ np_err(np, "%s failed to open %s\n",
+ np->dev_name, dev->name);
+ goto put;
+ }
+ }
+
if (netdev_master_upper_dev_get(ndev)) {
np_err(np, "%s is a slave device, aborting\n", np->dev_name);
err = -EBUSY;
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9bd4cab7d510..99a431c56f23 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -220,8 +220,7 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
static int update_netprio(const void *v, struct file *file, unsigned n)
{
- int err;
- struct socket *sock = sock_from_file(file, &err);
+ struct socket *sock = sock_from_file(file);
if (sock) {
spin_lock(&cgroup_sk_update_lock);
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index ef98372facf6..f3c690b8c8e3 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -11,6 +11,8 @@
#include <linux/device.h>
#include <net/page_pool.h>
+#include <net/xdp.h>
+
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
@@ -362,8 +364,9 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page)
* If the page refcnt != 1, then the page will be returned to memory
* subsystem.
*/
-void page_pool_put_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+static __always_inline struct page *
+__page_pool_put_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
@@ -379,15 +382,12 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,
page_pool_dma_sync_for_device(pool, page,
dma_sync_size);
- if (allow_direct && in_serving_softirq())
- if (page_pool_recycle_in_cache(page, pool))
- return;
+ if (allow_direct && in_serving_softirq() &&
+ page_pool_recycle_in_cache(page, pool))
+ return NULL;
- if (!page_pool_recycle_in_ring(pool, page)) {
- /* Cache full, fallback to free pages */
- page_pool_return_page(pool, page);
- }
- return;
+ /* Page found as candidate for recycling */
+ return page;
}
/* Fallback/non-XDP mode: API user have elevated refcnt.
*
@@ -405,9 +405,59 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,
/* Do not replace this with page_pool_return_page() */
page_pool_release_page(pool, page);
put_page(page);
+
+ return NULL;
+}
+
+void page_pool_put_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
+ if (page && !page_pool_recycle_in_ring(pool, page)) {
+ /* Cache full, fallback to free pages */
+ page_pool_return_page(pool, page);
+ }
}
EXPORT_SYMBOL(page_pool_put_page);
+/* Caller must not use data area after call, as this function overwrites it */
+void page_pool_put_page_bulk(struct page_pool *pool, void **data,
+ int count)
+{
+ int i, bulk_len = 0;
+
+ for (i = 0; i < count; i++) {
+ struct page *page = virt_to_head_page(data[i]);
+
+ page = __page_pool_put_page(pool, page, -1, false);
+ /* Approved for bulk recycling in ptr_ring cache */
+ if (page)
+ data[bulk_len++] = page;
+ }
+
+ if (unlikely(!bulk_len))
+ return;
+
+ /* Bulk producer into ptr_ring page_pool cache */
+ page_pool_ring_lock(pool);
+ for (i = 0; i < bulk_len; i++) {
+ if (__ptr_ring_produce(&pool->ring, data[i]))
+ break; /* ring full */
+ }
+ page_pool_ring_unlock(pool);
+
+	/* Hopefully all pages were returned into the ptr_ring */
+ if (likely(i == bulk_len))
+ return;
+
+ /* ptr_ring cache full, free remaining pages outside producer lock
+ * since put_page() with refcnt == 1 can be an expensive operation
+ */
+ for (; i < bulk_len; i++)
+ page_pool_return_page(pool, data[i]);
+}
+EXPORT_SYMBOL(page_pool_put_page_bulk);
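+
A rough sketch (not part of the patch) of a driver completion path handing a batch of page_pool-backed buffers to the new bulk API. The function name, array size and the way the buffer pointers are collected are assumptions:

#include <net/page_pool.h>

#define BULK_MAX	16

/* Hypothetical Tx-completion batching on top of page_pool_put_page_bulk() */
static void example_tx_clean(struct page_pool *pool,
			     void **completed, int n_completed)
{
	void *bulk[BULK_MAX];
	int i, n = 0;

	for (i = 0; i < n_completed; i++) {
		bulk[n++] = completed[i];	/* virtual address of the buffer */
		if (n == BULK_MAX) {
			page_pool_put_page_bulk(pool, bulk, n);
			n = 0;
		}
	}

	/* Pages that do not fit in the ptr_ring are freed by the helper
	 * itself, outside the producer lock.
	 */
	if (n)
		page_pool_put_page_bulk(pool, bulk, n);
}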
+
static void page_pool_empty_ring(struct page_pool *pool)
{
struct page *page;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 44fdbb9c6e53..105978604ffd 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -922,7 +922,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->min_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: min_pkt_size=%u",
+ sprintf(pg_result, "OK: min_pkt_size=%d",
pkt_dev->min_pkt_size);
return count;
}
@@ -939,7 +939,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->max_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: max_pkt_size=%u",
+ sprintf(pg_result, "OK: max_pkt_size=%d",
pkt_dev->max_pkt_size);
return count;
}
@@ -959,7 +959,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->max_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
+ sprintf(pg_result, "OK: pkt_size=%d", pkt_dev->min_pkt_size);
return count;
}
@@ -981,7 +981,7 @@ static ssize_t pktgen_if_write(struct file *file,
i += len;
pkt_dev->nfrags = value;
- sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
+ sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags);
return count;
}
if (!strcmp(name, "delay")) {
@@ -1146,7 +1146,7 @@ static ssize_t pktgen_if_write(struct file *file,
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
return -ENOTSUPP;
pkt_dev->burst = value < 1 ? 1 : value;
- sprintf(pg_result, "OK: burst=%d", pkt_dev->burst);
+ sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
return count;
}
if (!strcmp(name, "node")) {
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
index d964a5147f22..e33fde06d528 100644
--- a/net/core/ptp_classifier.c
+++ b/net/core/ptp_classifier.c
@@ -107,6 +107,36 @@ unsigned int ptp_classify_raw(const struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(ptp_classify_raw);
+struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type)
+{
+ u8 *ptr = skb_mac_header(skb);
+
+ if (type & PTP_CLASS_VLAN)
+ ptr += VLAN_HLEN;
+
+ switch (type & PTP_CLASS_PMASK) {
+ case PTP_CLASS_IPV4:
+ ptr += IPV4_HLEN(ptr) + UDP_HLEN;
+ break;
+ case PTP_CLASS_IPV6:
+ ptr += IP6_HLEN + UDP_HLEN;
+ break;
+ case PTP_CLASS_L2:
+ break;
+ default:
+ return NULL;
+ }
+
+ ptr += ETH_HLEN;
+
+ /* Ensure that the entire header is present in this packet. */
+ if (ptr + sizeof(struct ptp_header) > skb->data + skb->len)
+ return NULL;
+
+ return (struct ptp_header *)ptr;
+}
+EXPORT_SYMBOL_GPL(ptp_parse_header);
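+
A short sketch (not part of the patch) of how a driver timestamping path might combine ptp_classify_raw() with the new ptp_parse_header(); the helper name and the use of sequence_id are illustrative:

#include <linux/ptp_classify.h>

/* Hypothetical helper extracting the PTP sequence id of an outgoing skb */
static int example_ptp_seqid(struct sk_buff *skb, u16 *seqid)
{
	unsigned int type = ptp_classify_raw(skb);
	struct ptp_header *hdr;

	if (type == PTP_CLASS_NONE)
		return -EINVAL;

	/* Returns NULL if the full PTP header is not present in the skb */
	hdr = ptp_parse_header(skb, type);
	if (!hdr)
		return -EINVAL;

	*seqid = be16_to_cpu(hdr->sequence_id);
	return 0;
}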
+
void __init ptp_classifier_init(void)
{
static struct sock_filter ptp_filter[] __initdata = {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 68e0682450c6..3d6ab194d0f5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -139,7 +139,7 @@ bool lockdep_rtnl_is_held(void)
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */
-static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
static inline int rtm_msgindex(int msgtype)
{
@@ -157,7 +157,7 @@ static inline int rtm_msgindex(int msgtype)
static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
{
- struct rtnl_link **tab;
+ struct rtnl_link __rcu **tab;
if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
protocol = PF_UNSPEC;
@@ -166,7 +166,7 @@ static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
if (!tab)
tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);
- return tab[msgtype];
+ return rcu_dereference_rtnl(tab[msgtype]);
}
static int rtnl_register_internal(struct module *owner,
@@ -183,7 +183,7 @@ static int rtnl_register_internal(struct module *owner,
msgindex = rtm_msgindex(msgtype);
rtnl_lock();
- tab = rtnl_msg_handlers[protocol];
+ tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (tab == NULL) {
tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
if (!tab)
@@ -286,7 +286,8 @@ void rtnl_register(int protocol, int msgtype,
*/
int rtnl_unregister(int protocol, int msgtype)
{
- struct rtnl_link **tab, *link;
+ struct rtnl_link __rcu **tab;
+ struct rtnl_link *link;
int msgindex;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
@@ -299,7 +300,7 @@ int rtnl_unregister(int protocol, int msgtype)
return -ENOENT;
}
- link = tab[msgindex];
+ link = rtnl_dereference(tab[msgindex]);
rcu_assign_pointer(tab[msgindex], NULL);
rtnl_unlock();
@@ -318,20 +319,21 @@ EXPORT_SYMBOL_GPL(rtnl_unregister);
*/
void rtnl_unregister_all(int protocol)
{
- struct rtnl_link **tab, *link;
+ struct rtnl_link __rcu **tab;
+ struct rtnl_link *link;
int msgindex;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
rtnl_lock();
- tab = rtnl_msg_handlers[protocol];
+ tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (!tab) {
rtnl_unlock();
return;
}
RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
- link = tab[msgindex];
+ link = rtnl_dereference(tab[msgindex]);
if (!link)
continue;
@@ -1939,7 +1941,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla
if (linfo[IFLA_INFO_KIND]) {
char kind[MODULE_NAME_LEN];
- nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
+ nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
ops = rtnl_link_ops_get(kind);
}
@@ -2953,9 +2955,9 @@ static struct net_device *rtnl_dev_get(struct net *net,
if (!ifname) {
ifname = buffer;
if (ifname_attr)
- nla_strlcpy(ifname, ifname_attr, IFNAMSIZ);
+ nla_strscpy(ifname, ifname_attr, IFNAMSIZ);
else if (altifname_attr)
- nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ);
+ nla_strscpy(ifname, altifname_attr, ALTIFNAMSIZ);
else
return NULL;
}
@@ -2983,7 +2985,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
else
ifname[0] = '\0';
@@ -3264,7 +3266,7 @@ replay:
return err;
if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
else
ifname[0] = '\0';
@@ -3296,7 +3298,7 @@ replay:
memset(linkinfo, 0, sizeof(linkinfo));
if (linkinfo[IFLA_INFO_KIND]) {
- nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+ nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
ops = rtnl_link_ops_get(kind);
} else {
kind[0] = '\0';
@@ -3437,26 +3439,15 @@ replay:
dev->ifindex = ifm->ifi_index;
- if (ops->newlink) {
+ if (ops->newlink)
err = ops->newlink(link_net ? : net, dev, tb, data, extack);
- /* Drivers should call free_netdev() in ->destructor
- * and unregister it on failure after registration
- * so that device could be finally freed in rtnl_unlock.
- */
- if (err < 0) {
- /* If device is not registered at all, free it now */
- if (dev->reg_state == NETREG_UNINITIALIZED ||
- dev->reg_state == NETREG_UNREGISTERED)
- free_netdev(dev);
- goto out;
- }
- } else {
+ else
err = register_netdevice(dev);
- if (err < 0) {
- free_netdev(dev);
- goto out;
- }
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
}
+
err = rtnl_configure_link(dev, ifm);
if (err < 0)
goto out_unregister;
@@ -3709,13 +3700,13 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
}
-static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
- struct net_device *dev;
+ size_t min_ifinfo_dump_size = 0;
struct nlattr *tb[IFLA_MAX+1];
u32 ext_filter_mask = 0;
- u16 min_ifinfo_dump_size = 0;
+ struct net_device *dev;
int hdrlen;
/* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
@@ -3735,9 +3726,8 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
*/
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
- min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
- if_nlmsg_size(dev,
- ext_filter_mask));
+ min_ifinfo_dump_size = max(min_ifinfo_dump_size,
+ if_nlmsg_size(dev, ext_filter_mask));
}
rcu_read_unlock();
@@ -3755,7 +3745,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
s_idx = 1;
for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
- struct rtnl_link **tab;
+ struct rtnl_link __rcu **tab;
struct rtnl_link *link;
rtnl_dumpit_func dumpit;
@@ -3769,7 +3759,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
if (!tab)
continue;
- link = tab[type];
+ link = rcu_dereference_rtnl(tab[type]);
if (!link)
continue;
@@ -5494,7 +5484,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
- u16 min_dump_alloc = 0;
+ u32 min_dump_alloc = 0;
link = rtnl_get_link(family, type);
if (!link || !link->dumpit) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2b48cb0cc684..785daff48030 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -249,6 +249,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
fclones->skb2.fclone = SKB_FCLONE_CLONE;
}
+
+ skb_set_kcov_handle(skb, kcov_common_handle());
+
out:
return skb;
nodata:
@@ -282,6 +285,8 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb,
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
+ skb_set_kcov_handle(skb, kcov_common_handle());
+
return skb;
}
@@ -432,7 +437,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
len += NET_SKB_PAD;
- if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(1024) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
@@ -496,13 +505,17 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
gfp_t gfp_mask)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc;
struct sk_buff *skb;
void *data;
len += NET_SKB_PAD + NET_IP_ALIGN;
- if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(1024) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
@@ -510,6 +523,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
goto skb_success;
}
+ nc = this_cpu_ptr(&napi_alloc_cache);
len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
len = SKB_DATA_ALIGN(len);
@@ -712,11 +726,10 @@ EXPORT_SYMBOL(kfree_skb_list);
*
* Must only be called from net_ratelimit()-ed paths.
*
- * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise.
+ * Dumps whole packets if full_pkt, only headers otherwise.
*/
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
- static atomic_t can_dump_full = ATOMIC_INIT(5);
struct skb_shared_info *sh = skb_shinfo(skb);
struct net_device *dev = skb->dev;
struct sock *sk = skb->sk;
@@ -726,9 +739,6 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
int i, len, seg_len;
if (full_pkt)
- full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0;
-
- if (full_pkt)
len = skb->len;
else
len = min_t(int, skb->len, MAX_HEADER + 128);
@@ -841,7 +851,7 @@ EXPORT_SYMBOL(consume_skb);
#endif
/**
- * consume_stateless_skb - free an skbuff, assuming it is stateless
+ * __consume_stateless_skb - free an skbuff, assuming it is stateless
* @skb: buffer to free
*
* Alike consume_skb(), but this variant assumes that this is the last
@@ -895,15 +905,14 @@ void __kfree_skb_defer(struct sk_buff *skb)
void napi_consume_skb(struct sk_buff *skb, int budget)
{
- if (unlikely(!skb))
- return;
-
/* Zero budget indicate non-NAPI context called us, like netpoll */
if (unlikely(!budget)) {
dev_consume_skb_any(skb);
return;
}
+ lockdep_assert_in_softirq();
+
if (!skb_unref(skb))
return;
@@ -2018,6 +2027,12 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
skb->csum = csum_block_sub(skb->csum,
skb_checksum(skb, len, delta, 0),
len);
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
+ int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
+
+ if (offset + sizeof(__sum16) > hdlen)
+ return -EINVAL;
}
return __pskb_trim(skb, len);
}
@@ -2725,19 +2740,20 @@ EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
- u8 *to, int len, __wsum csum)
+ u8 *to, int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int pos = 0;
+ __wsum csum = 0;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
csum = csum_partial_copy_nocheck(skb->data + offset, to,
- copy, csum);
+ copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -2767,7 +2783,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
vaddr = kmap_atomic(p);
csum2 = csum_partial_copy_nocheck(vaddr + p_off,
to + copied,
- p_len, 0);
+ p_len);
kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
pos += p_len;
@@ -2793,7 +2809,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
copy = len;
csum2 = skb_copy_and_csum_bits(frag_iter,
offset - start,
- to, copy, 0);
+ to, copy);
csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
@@ -3013,7 +3029,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
csum = 0;
if (csstart != skb->len)
csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
- skb->len - csstart, 0);
+ skb->len - csstart);
if (skb->ip_summed == CHECKSUM_PARTIAL) {
long csstuff = csstart + skb->csum_offset;
@@ -3435,6 +3451,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
st->root_skb = st->cur_skb = skb;
st->frag_idx = st->stepped_offset = 0;
st->frag_data = NULL;
+ st->frag_off = 0;
}
EXPORT_SYMBOL(skb_prepare_seq_read);
@@ -3489,14 +3506,27 @@ next_skb:
st->stepped_offset += skb_headlen(st->cur_skb);
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+ unsigned int pg_idx, pg_off, pg_sz;
+
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
- block_limit = skb_frag_size(frag) + st->stepped_offset;
+ pg_idx = 0;
+ pg_off = skb_frag_off(frag);
+ pg_sz = skb_frag_size(frag);
+
+ if (skb_frag_must_loop(skb_frag_page(frag))) {
+ pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
+ pg_off = offset_in_page(pg_off + st->frag_off);
+ pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
+ PAGE_SIZE - pg_off);
+ }
+
+ block_limit = pg_sz + st->stepped_offset;
if (abs_offset < block_limit) {
if (!st->frag_data)
- st->frag_data = kmap_atomic(skb_frag_page(frag));
+ st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);
- *data = (u8 *) st->frag_data + skb_frag_off(frag) +
+ *data = (u8 *)st->frag_data + pg_off +
(abs_offset - st->stepped_offset);
return block_limit - abs_offset;
@@ -3507,8 +3537,12 @@ next_skb:
st->frag_data = NULL;
}
- st->frag_idx++;
- st->stepped_offset += skb_frag_size(frag);
+ st->stepped_offset += pg_sz;
+ st->frag_off += pg_sz;
+ if (st->frag_off == skb_frag_size(frag)) {
+ st->frag_off = 0;
+ st->frag_idx++;
+ }
}
if (st->frag_data) {
@@ -3648,7 +3682,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
unsigned int delta_truesize = 0;
unsigned int delta_len = 0;
struct sk_buff *tail = NULL;
- struct sk_buff *nskb;
+ struct sk_buff *nskb, *tmp;
+ int err;
skb_push(skb, -skb_network_offset(skb) + offset);
@@ -3658,11 +3693,28 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
nskb = list_skb;
list_skb = list_skb->next;
+ err = 0;
+ if (skb_shared(nskb)) {
+ tmp = skb_clone(nskb, GFP_ATOMIC);
+ if (tmp) {
+ consume_skb(nskb);
+ nskb = tmp;
+ err = skb_unclone(nskb, GFP_ATOMIC);
+ } else {
+ err = -ENOMEM;
+ }
+ }
+
if (!tail)
skb->next = nskb;
else
tail->next = nskb;
+ if (unlikely(err)) {
+ nskb->next = list_skb;
+ goto err_linearize;
+ }
+
tail = nskb;
delta_len += nskb->len;
@@ -3934,7 +3986,7 @@ normal:
skb_copy_and_csum_bits(head_skb, offset,
skb_put(nskb,
len),
- len, 0);
+ len);
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
} else {
@@ -4555,7 +4607,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
if (skb && (skb_next = skb_peek(q))) {
icmp_next = is_icmp_err_skb(skb_next);
if (icmp_next)
- sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin;
+ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
}
spin_unlock_irqrestore(&q->lock, flags);
@@ -5436,7 +5488,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
goto err_free;
skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
return skb;
@@ -5561,6 +5614,73 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
}
EXPORT_SYMBOL(skb_vlan_push);
+/**
+ * skb_eth_pop() - Drop the Ethernet header at the head of a packet
+ *
+ * @skb: Socket buffer to modify
+ *
+ * Drop the Ethernet header of @skb.
+ *
+ * Expects that skb->data points to the mac header and that no VLAN tags are
+ * present.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_eth_pop(struct sk_buff *skb)
+{
+ if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
+ skb_network_offset(skb) < ETH_HLEN)
+ return -EPROTO;
+
+ skb_pull_rcsum(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_eth_pop);
+
+/**
+ * skb_eth_push() - Add a new Ethernet header at the head of a packet
+ *
+ * @skb: Socket buffer to modify
+ * @dst: Destination MAC address of the new header
+ * @src: Source MAC address of the new header
+ *
+ * Prepend @skb with a new Ethernet header.
+ *
+ * Expects that skb->data points to the mac header, which must be empty.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
+ const unsigned char *src)
+{
+ struct ethhdr *eth;
+ int err;
+
+ if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
+ return -EPROTO;
+
+ err = skb_cow_head(skb, sizeof(*eth));
+ if (err < 0)
+ return err;
+
+ skb_push(skb, sizeof(*eth));
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ eth = eth_hdr(skb);
+ ether_addr_copy(eth->h_dest, dst);
+ ether_addr_copy(eth->h_source, src);
+ eth->h_proto = skb->protocol;
+
+ skb_postpush_rcsum(skb, eth, sizeof(*eth));
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_eth_push);
+
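As an illustrative usage sketch (not part of this patch), a caller could strip the Ethernet header with the first helper, operate on the inner packet, and rebuild the header with the second; the surrounding function and the new_dst/new_src addresses below are assumptions, not taken from this diff:

/* Hedged sketch built only on the two helpers documented above. */
static int example_rewrite_eth(struct sk_buff *skb,
                               const unsigned char *new_dst,
                               const unsigned char *new_src)
{
        int err;

        err = skb_eth_pop(skb);         /* skb->data now at the network header */
        if (err)
                return err;

        /* ... inspect or modify the L3 packet here ... */

        return skb_eth_push(skb, new_dst, new_src); /* prepend a fresh header */
}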
/* Update the ethertype of hdr and the skb csum value if required. */
static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
__be16 ethertype)
@@ -5725,6 +5845,9 @@ int skb_mpls_dec_ttl(struct sk_buff *skb)
if (unlikely(!eth_p_mpls(skb->protocol)))
return -EINVAL;
+ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
+ return -ENOMEM;
+
lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
if (!--ttl)
@@ -5955,8 +6078,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
size = SKB_WITH_OVERHEAD(ksize(data));
memcpy((struct skb_shared_info *)(data + size),
- skb_shinfo(skb), offsetof(struct skb_shared_info,
- frags[skb_shinfo(skb)->nr_frags]));
+ skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
if (skb_orphan_frags(skb, gfp_mask)) {
kfree(data);
return -ENOMEM;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 649583158983..25cdbb20f3a0 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -170,10 +170,12 @@ static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
struct scatterlist *sge = sk_msg_elem(msg, i);
u32 len = sge->length;
- if (charge)
- sk_mem_uncharge(sk, len);
- if (!msg->skb)
+ /* When the skb owns the memory, we free it from the consume_skb() path. */
+ if (!msg->skb) {
+ if (charge)
+ sk_mem_uncharge(sk, len);
put_page(sg_page(sge));
+ }
memset(sge, 0, sizeof(*sge));
return len;
}
@@ -397,28 +399,45 @@ out:
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
-static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
+ struct sk_buff *skb)
{
- struct sock *sk = psock->sk;
- int copied = 0, num_sge;
struct sk_msg *msg;
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ return NULL;
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize))
+ return NULL;
+
msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
if (unlikely(!msg))
- return -EAGAIN;
- if (!sk_rmem_schedule(sk, skb, skb->len)) {
- kfree(msg);
- return -EAGAIN;
- }
+ return NULL;
sk_msg_init(msg);
+ return msg;
+}
+
+static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
+ struct sk_psock *psock,
+ struct sock *sk,
+ struct sk_msg *msg)
+{
+ int num_sge, copied;
+
+ /* skb linearize may fail with ENOMEM, but let's simply try again
+ * later if this happens. Under memory pressure we don't want to
+ * drop the skb. We need to linearize the skb so that the mapping
+ * in skb_to_sgvec cannot fail.
+ */
+ if (skb_linearize(skb))
+ return -EAGAIN;
num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
if (unlikely(num_sge < 0)) {
kfree(msg);
return num_sge;
}
- sk_mem_charge(sk, skb->len);
copied = skb->len;
msg->sg.start = 0;
msg->sg.size = copied;
@@ -430,13 +449,57 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
return copied;
}
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb);
+
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk = psock->sk;
+ struct sk_msg *msg;
+
+ /* If we are receiving on the same sock, skb->sk is already assigned;
+ * skip memory accounting and the owner transition since both are
+ * already set correctly.
+ */
+ if (unlikely(skb->sk == sk))
+ return sk_psock_skb_ingress_self(psock, skb);
+ msg = sk_psock_create_ingress_msg(sk, skb);
+ if (!msg)
+ return -EAGAIN;
+
+ /* This will transition ownership of the data from the socket where
+ * the BPF program was run initiating the redirect to the socket
+ * we will eventually receive this data on. The data will be released
+ * from consume_skb() found in __tcp_bpf_recvmsg() after it's been copied
+ * into user buffers.
+ */
+ skb_set_owner_r(skb, sk);
+ return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
+/* Puts an skb on the ingress queue of the socket already assigned to the
+ * skb. In this case we do not need to check memory limits or skb_set_owner_r
+ * because the skb is already accounted for here.
+ */
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ struct sock *sk = psock->sk;
+
+ if (unlikely(!msg))
+ return -EAGAIN;
+ sk_msg_init(msg);
+ return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress)
{
- if (ingress)
- return sk_psock_skb_ingress(psock, skb);
- else
+ if (!ingress) {
+ if (!sock_writeable(psock->sk))
+ return -EAGAIN;
return skb_send_sock_locked(psock->sk, skb, off, len);
+ }
+ return sk_psock_skb_ingress(psock, skb);
}
static void sk_psock_backlog(struct work_struct *work)
@@ -494,14 +557,34 @@ end:
struct sk_psock *sk_psock_init(struct sock *sk, int node)
{
- struct sk_psock *psock = kzalloc_node(sizeof(*psock),
- GFP_ATOMIC | __GFP_NOWARN,
- node);
- if (!psock)
- return NULL;
+ struct sk_psock *psock;
+ struct proto *prot;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ if (inet_csk_has_ulp(sk)) {
+ psock = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ if (sk->sk_user_data) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node);
+ if (!psock) {
+ psock = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ prot = READ_ONCE(sk->sk_prot);
psock->sk = sk;
- psock->eval = __SK_NONE;
+ psock->eval = __SK_NONE;
+ psock->sk_proto = prot;
+ psock->saved_unhash = prot->unhash;
+ psock->saved_close = prot->close;
+ psock->saved_write_space = sk->sk_write_space;
INIT_LIST_HEAD(&psock->link);
spin_lock_init(&psock->link_lock);
@@ -516,6 +599,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
rcu_assign_sk_user_data_nocopy(sk, psock);
sock_hold(sk);
+out:
+ write_unlock_bh(&sk->sk_callback_lock);
return psock;
}
EXPORT_SYMBOL_GPL(sk_psock_init);
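Illustrative note, mirroring the call-site updates later in this patch: sk_psock_init() now reports failure through ERR_PTR() rather than NULL, so callers are expected to check it along these lines:

        psock = sk_psock_init(sk, map->numa_node);
        if (IS_ERR(psock))
                return PTR_ERR(psock);  /* -EINVAL, -EBUSY or -ENOMEM */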
@@ -603,6 +688,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
rcu_assign_sk_user_data(sk, NULL);
if (psock->progs.skb_parser)
sk_psock_stop_strp(sk, psock);
+ else if (psock->progs.skb_verdict)
+ sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
@@ -660,19 +747,8 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
struct sk_buff *skb)
{
- int ret;
-
- skb->sk = psock->sk;
bpf_compute_data_end_sk_skb(skb);
- ret = bpf_prog_run_pin_on_cpu(prog, skb);
- /* strparser clones the skb before handing it to a upper layer,
- * meaning skb_orphan has been called. We NULL sk on the way out
- * to ensure we don't trigger a BUG_ON() in skb/sk operations
- * later and because we are not charging the memory of this skb
- * to any socket yet.
- */
- skb->sk = NULL;
- return ret;
+ return bpf_prog_run_pin_on_cpu(prog, skb);
}
static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
@@ -687,38 +763,35 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
- bool ingress;
sk_other = tcp_skb_bpf_redirect_fetch(skb);
+ /* This error indicates a buggy BPF program: it returned a redirect
+ * return code, but then didn't set a redirect interface.
+ */
if (unlikely(!sk_other)) {
kfree_skb(skb);
return;
}
psock_other = sk_psock(sk_other);
+ /* This error indicates the socket is being torn down or had another
+ * error that caused the pipe to break. We can't send a packet on
+ * a socket that is in this state so we drop the skb.
+ */
if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
kfree_skb(skb);
return;
}
- ingress = tcp_skb_bpf_ingress(skb);
- if ((!ingress && sock_writeable(sk_other)) ||
- (ingress &&
- atomic_read(&sk_other->sk_rmem_alloc) <=
- sk_other->sk_rcvbuf)) {
- if (!ingress)
- skb_set_owner_w(skb, sk_other);
- skb_queue_tail(&psock_other->ingress_skb, skb);
- schedule_work(&psock_other->work);
- } else {
- kfree_skb(skb);
- }
+ skb_queue_tail(&psock_other->ingress_skb, skb);
+ schedule_work(&psock_other->work);
}
-static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict)
+static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
{
switch (verdict) {
case __SK_REDIRECT:
+ skb_set_owner_r(skb, sk);
sk_psock_skb_redirect(skb);
break;
case __SK_PASS:
@@ -736,11 +809,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
rcu_read_lock();
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
+ /* We skip full set_owner_r here because if we do a SK_PASS
+ * or SK_DROP we can skip skb memory accounting and use the
+ * TLS context.
+ */
+ skb->sk = psock->sk;
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb->sk = NULL;
}
- sk_psock_tls_verdict_apply(skb, ret);
+ sk_psock_tls_verdict_apply(skb, psock->sk, ret);
rcu_read_unlock();
return ret;
}
@@ -749,7 +828,9 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
+ struct tcp_skb_cb *tcp;
struct sock *sk_other;
+ int err = -EIO;
switch (verdict) {
case __SK_PASS:
@@ -758,16 +839,24 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
goto out_free;
}
- if (atomic_read(&sk_other->sk_rmem_alloc) <=
- sk_other->sk_rcvbuf) {
- struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);
- tcp->bpf.flags |= BPF_F_INGRESS;
+ tcp = TCP_SKB_CB(skb);
+ tcp->bpf.flags |= BPF_F_INGRESS;
+
+ /* If the queue is empty then we can submit directly
+ * into the msg queue. If it's not empty we have to
+ * queue work, otherwise we may get OOO data. If
+ * sk_psock_skb_ingress() errors, the skb is queued and
+ * retried later from the workqueue.
+ */
+ if (skb_queue_empty(&psock->ingress_skb)) {
+ err = sk_psock_skb_ingress_self(psock, skb);
+ }
+ if (err < 0) {
skb_queue_tail(&psock->ingress_skb, skb);
schedule_work(&psock->work);
- break;
}
- goto out_free;
+ break;
case __SK_REDIRECT:
sk_psock_skb_redirect(skb);
break;
@@ -792,9 +881,9 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
kfree_skb(skb);
goto out;
}
+ skb_set_owner_r(skb, sk);
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
- skb_orphan(skb);
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
@@ -817,8 +906,11 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
rcu_read_lock();
prog = READ_ONCE(psock->progs.skb_parser);
- if (likely(prog))
+ if (likely(prog)) {
+ skb->sk = psock->sk;
ret = sk_psock_bpf_run(psock, prog, skb);
+ skb->sk = NULL;
+ }
rcu_read_unlock();
return ret;
}
@@ -842,6 +934,57 @@ static void sk_psock_strp_data_ready(struct sock *sk)
rcu_read_unlock();
}
+static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t orig_len)
+{
+ struct sock *sk = (struct sock *)desc->arg.data;
+ struct sk_psock *psock;
+ struct bpf_prog *prog;
+ int ret = __SK_DROP;
+ int len = skb->len;
+
+ /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb) {
+ desc->error = -ENOMEM;
+ return 0;
+ }
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ len = 0;
+ kfree_skb(skb);
+ goto out;
+ }
+ skb_set_owner_r(skb, sk);
+ prog = READ_ONCE(psock->progs.skb_verdict);
+ if (likely(prog)) {
+ tcp_skb_bpf_redirect_clear(skb);
+ ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ }
+ sk_psock_verdict_apply(psock, skb, ret);
+out:
+ rcu_read_unlock();
+ return len;
+}
+
+static void sk_psock_verdict_data_ready(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ read_descriptor_t desc;
+
+ if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
+ return;
+
+ desc.arg.data = sk;
+ desc.error = 0;
+ desc.count = 1;
+
+ sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
+}
+
static void sk_psock_write_space(struct sock *sk)
{
struct sk_psock *psock;
@@ -871,6 +1014,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
return strp_init(&psock->parser.strp, sk, &cb);
}
+void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_parser *parser = &psock->parser;
+
+ if (parser->enabled)
+ return;
+
+ parser->saved_data_ready = sk->sk_data_ready;
+ sk->sk_data_ready = sk_psock_verdict_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+ parser->enabled = true;
+}
+
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
@@ -896,3 +1052,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
strp_stop(&parser->strp);
parser->enabled = false;
}
+
+void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_parser *parser = &psock->parser;
+
+ if (!parser->enabled)
+ return;
+
+ sk->sk_data_ready = parser->saved_data_ready;
+ parser->saved_data_ready = NULL;
+ parser->enabled = false;
+}
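For context, the verdict-only path added above allows a BPF_SK_SKB_STREAM_VERDICT program to be attached without a companion parser. A minimal sketch of such a program follows; the map layout, section name and redirect key are illustrative assumptions, not taken from this patch:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_SOCKMAP);
        __uint(max_entries, 2);
        __type(key, __u32);
        __type(value, __u64);
} sock_map SEC(".maps");

SEC("sk_skb/stream_verdict")
int prog_stream_verdict(struct __sk_buff *skb)
{
        __u32 idx = 0;

        /* redirect every skb to the ingress queue of the socket at slot 0 */
        return bpf_sk_redirect_map(skb, &sock_map, idx, BPF_F_INGRESS);
}

char LICENSE[] SEC("license") = "GPL";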
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c5c6b18eff4..bbcd4b97eddd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -413,18 +413,6 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
return 0;
}
-static void sock_warn_obsolete_bsdism(const char *name)
-{
- static int warned;
- static char warncomm[TASK_COMM_LEN];
- if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
- pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
- warncomm, name);
- warned++;
- }
-}
-
static bool sock_needs_netstamp(const struct sock *sk)
{
switch (sk->sk_family) {
@@ -769,7 +757,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
} else {
sock_reset_flag(sk, SOCK_RCVTSTAMP);
sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
}
}
@@ -984,7 +971,6 @@ set_sndbuf:
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("setsockopt");
break;
case SO_PASSCRED:
@@ -1007,8 +993,6 @@ set_sndbuf:
__sock_set_timestamps(sk, valbool, true, true);
break;
case SO_TIMESTAMPING_NEW:
- sock_set_flag(sk, SOCK_TSTAMP_NEW);
- fallthrough;
case SO_TIMESTAMPING_OLD:
if (val & ~SOF_TIMESTAMPING_MASK) {
ret = -EINVAL;
@@ -1037,16 +1021,14 @@ set_sndbuf:
}
sk->sk_tsflags = val;
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
SOCK_TIMESTAMPING_RX_SOFTWARE);
- else {
- if (optname == SO_TIMESTAMPING_NEW)
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-
+ else
sock_disable_timestamp(sk,
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
- }
break;
case SO_RCVLOWAT:
@@ -1177,11 +1159,27 @@ set_sndbuf:
sk->sk_ll_usec = val;
}
break;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ break;
+ case SO_BUSY_POLL_BUDGET:
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ } else {
+ if (val < 0 || val > U16_MAX)
+ ret = -EINVAL;
+ else
+ WRITE_ONCE(sk->sk_busy_poll_budget, val);
+ }
+ break;
#endif
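From user space, the two new options above could be exercised roughly as follows. This is a sketch only: the budget value is arbitrary, the option constants are assumed to come from the matching uapi update in this series, and raising SO_BUSY_POLL_BUDGET past its current value needs CAP_NET_ADMIN as coded above:

#include <sys/socket.h>

/* fd is an already-created socket */
static int enable_busy_poll(int fd)
{
        int one = 1, budget = 64;

        if (setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &one, sizeof(one)))
                return -1;
        return setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET,
                          &budget, sizeof(budget));
}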
case SO_MAX_PACING_RATE:
{
- unsigned long ulval = (val == ~0U) ? ~0UL : val;
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
if (sizeof(ulval) != sizeof(val) &&
optlen >= sizeof(ulval) &&
@@ -1387,7 +1385,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("getsockopt");
break;
case SO_TIMESTAMP_OLD:
@@ -1542,6 +1539,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BUSY_POLL:
v.val = sk->sk_ll_usec;
break;
+ case SO_PREFER_BUSY_POLL:
+ v.val = READ_ONCE(sk->sk_prefer_busy_poll);
+ break;
#endif
case SO_MAX_PACING_RATE:
@@ -2505,7 +2505,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
}
EXPORT_SYMBOL(sk_page_frag_refill);
-static void __lock_sock(struct sock *sk)
+void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
@@ -2827,14 +2827,8 @@ EXPORT_SYMBOL(sock_no_mmap);
void __receive_sock(struct file *file)
{
struct socket *sock;
- int error;
- /*
- * The resulting value of "error" is ignored here since we only
- * need to take action when the file is a socket and testing
- * "sock" for NULL is sufficient.
- */
- sock = sock_from_file(file, &error);
+ sock = sock_from_file(file);
if (sock) {
sock_update_netprioidx(&sock->sk->sk_cgrp_data);
sock_update_classid(&sock->sk->sk_cgrp_data);
@@ -2961,6 +2955,13 @@ void sk_stop_timer(struct sock *sk, struct timer_list* timer)
}
EXPORT_SYMBOL(sk_stop_timer);
+void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
+{
+ if (del_timer_sync(timer))
+ __sock_put(sk);
+}
+EXPORT_SYMBOL(sk_stop_timer_sync);
+
void sock_init_data(struct socket *sock, struct sock *sk)
{
sk_init_common(sk);
@@ -3090,7 +3091,7 @@ EXPORT_SYMBOL(release_sock);
*
* sk_lock.slock unlocked, owned = 1, BH enabled
*/
-bool lock_sock_fast(struct sock *sk)
+bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
@@ -3108,6 +3109,7 @@ bool lock_sock_fast(struct sock *sk)
* The sk_lock has mutex_lock() semantics here:
*/
mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+ __acquire(&sk->sk_lock.slock);
local_bh_enable();
return true;
}
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c13ffbd33d8d..c9c45b935f99 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -11,7 +11,7 @@
#include <linux/tcp.h>
#include <linux/workqueue.h>
#include <linux/nospec.h>
-
+#include <linux/cookie.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
@@ -19,16 +19,17 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
static struct workqueue_struct *broadcast_wq;
-static atomic64_t cookie_gen;
-u64 sock_gen_cookie(struct sock *sk)
+DEFINE_COOKIE(sock_cookie);
+
+u64 __sock_gen_cookie(struct sock *sk)
{
while (1) {
u64 res = atomic64_read(&sk->sk_cookie);
if (res)
return res;
- res = atomic64_inc_return(&cookie_gen);
+ res = gen_cookie_next(&sock_cookie);
atomic64_cmpxchg(&sk->sk_cookie, 0, res);
}
}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 119f52a99dc1..64b5ec14ff50 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
#include <linux/bpf.h>
+#include <linux/btf_ids.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
@@ -26,8 +27,6 @@ struct bpf_stab {
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
struct bpf_stab *stab;
- u64 cost;
- int err;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@@ -38,29 +37,22 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
- stab = kzalloc(sizeof(*stab), GFP_USER);
+ stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT);
if (!stab)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&stab->map, attr);
raw_spin_lock_init(&stab->lock);
- /* Make sure page count doesn't overflow. */
- cost = (u64) stab->map.max_entries * sizeof(struct sock *);
- err = bpf_map_charge_init(&stab->map.memory, cost);
- if (err)
- goto free_stab;
-
stab->sks = bpf_map_area_alloc(stab->map.max_entries *
sizeof(struct sock *),
stab->map.numa_node);
- if (stab->sks)
- return &stab->map;
- err = -ENOMEM;
- bpf_map_charge_finish(&stab->map.memory);
-free_stab:
- kfree(stab);
- return ERR_PTR(err);
+ if (!stab->sks) {
+ kfree(stab);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &stab->map;
}
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -147,8 +139,8 @@ static void sock_map_add_link(struct sk_psock *psock,
static void sock_map_del_link(struct sock *sk,
struct sk_psock *psock, void *link_raw)
{
+ bool strp_stop = false, verdict_stop = false;
struct sk_psock_link *link, *tmp;
- bool strp_stop = false;
spin_lock_bh(&psock->link_lock);
list_for_each_entry_safe(link, tmp, &psock->link, list) {
@@ -158,14 +150,19 @@ static void sock_map_del_link(struct sock *sk,
map);
if (psock->parser.enabled && stab->progs.skb_parser)
strp_stop = true;
+ if (psock->parser.enabled && stab->progs.skb_verdict)
+ verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
}
}
spin_unlock_bh(&psock->link_lock);
- if (strp_stop) {
+ if (strp_stop || verdict_stop) {
write_lock_bh(&sk->sk_callback_lock);
- sk_psock_stop_strp(sk, psock);
+ if (strp_stop)
+ sk_psock_stop_strp(sk, psock);
+ else
+ sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
}
}
@@ -184,8 +181,6 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
{
struct proto *prot;
- sock_owned_by_me(sk);
-
switch (sk->sk_type) {
case SOCK_STREAM:
prot = tcp_bpf_get_proto(sk, psock);
@@ -231,20 +226,21 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
{
struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
struct sk_psock *psock;
- bool skb_progs;
int ret;
skb_verdict = READ_ONCE(progs->skb_verdict);
- skb_parser = READ_ONCE(progs->skb_parser);
- skb_progs = skb_parser && skb_verdict;
- if (skb_progs) {
+ if (skb_verdict) {
skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
if (IS_ERR(skb_verdict))
return PTR_ERR(skb_verdict);
+ }
+
+ skb_parser = READ_ONCE(progs->skb_parser);
+ if (skb_parser) {
skb_parser = bpf_prog_inc_not_zero(skb_parser);
if (IS_ERR(skb_parser)) {
- bpf_prog_put(skb_verdict);
- return PTR_ERR(skb_parser);
+ ret = PTR_ERR(skb_parser);
+ goto out_put_skb_verdict;
}
}
@@ -253,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
msg_parser = bpf_prog_inc_not_zero(msg_parser);
if (IS_ERR(msg_parser)) {
ret = PTR_ERR(msg_parser);
- goto out;
+ goto out_put_skb_parser;
}
}
@@ -265,15 +261,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
if (psock) {
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
- (skb_progs && READ_ONCE(psock->progs.skb_parser))) {
+ (skb_parser && READ_ONCE(psock->progs.skb_parser)) ||
+ (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
sk_psock_put(sk, psock);
ret = -EBUSY;
goto out_progs;
}
} else {
psock = sk_psock_init(sk, map->numa_node);
- if (!psock) {
- ret = -ENOMEM;
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
goto out_progs;
}
}
@@ -286,28 +283,32 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
- if (skb_progs && !psock->parser.enabled) {
+ if (skb_parser && skb_verdict && !psock->parser.enabled) {
ret = sk_psock_init_strp(sk, psock);
- if (ret) {
- write_unlock_bh(&sk->sk_callback_lock);
- goto out_drop;
- }
+ if (ret)
+ goto out_unlock_drop;
psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
psock_set_prog(&psock->progs.skb_parser, skb_parser);
sk_psock_start_strp(sk, psock);
+ } else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
+ psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+ sk_psock_start_verdict(sk, psock);
}
write_unlock_bh(&sk->sk_callback_lock);
return 0;
+out_unlock_drop:
+ write_unlock_bh(&sk->sk_callback_lock);
out_drop:
sk_psock_put(sk, psock);
out_progs:
if (msg_parser)
bpf_prog_put(msg_parser);
-out:
- if (skb_progs) {
- bpf_prog_put(skb_verdict);
+out_put_skb_parser:
+ if (skb_parser)
bpf_prog_put(skb_parser);
- }
+out_put_skb_verdict:
+ if (skb_verdict)
+ bpf_prog_put(skb_verdict);
return ret;
}
@@ -322,8 +323,8 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk)
if (!psock) {
psock = sk_psock_init(sk, map->numa_node);
- if (!psock)
- return -ENOMEM;
+ if (IS_ERR(psock))
+ return PTR_ERR(psock);
}
ret = sock_map_init_proto(sk, psock);
@@ -384,7 +385,7 @@ static void *sock_map_lookup(struct bpf_map *map, void *key)
struct sock *sk;
sk = __sock_map_lookup_elem(map, *(u32 *)key);
- if (!sk || !sk_fullsock(sk))
+ if (!sk)
return NULL;
if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
return NULL;
@@ -402,7 +403,7 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
if (!sk)
return ERR_PTR(-ENOENT);
- sock_gen_cookie(sk);
+ __sock_gen_cookie(sk);
return &sk->sk_cookie;
}
@@ -478,8 +479,6 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
return -EINVAL;
if (unlikely(idx >= map->max_entries))
return -E2BIG;
- if (inet_csk_has_ulp(sk))
- return -EINVAL;
link = sk_psock_init_link();
if (!link)
@@ -563,10 +562,12 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
return false;
}
-static int sock_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags);
+
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
- u32 idx = *(u32 *)key;
struct socket *sock;
struct sock *sk;
int ret;
@@ -595,14 +596,41 @@ static int sock_map_update_elem(struct bpf_map *map, void *key,
sock_map_sk_acquire(sk);
if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
else
- ret = sock_map_update_common(map, idx, sk, flags);
+ ret = sock_hash_update_common(map, key, sk, flags);
sock_map_sk_release(sk);
out:
fput(sock->file);
return ret;
}
+static int sock_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct sock *sk = (struct sock *)value;
+ int ret;
+
+ if (unlikely(!sk || !sk_fullsock(sk)))
+ return -EINVAL;
+
+ if (!sock_map_sk_is_suitable(sk))
+ return -EOPNOTSUPP;
+
+ local_bh_disable();
+ bh_lock_sock(sk);
+ if (!sock_map_sk_state_allowed(sk))
+ ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ return ret;
+}
+
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
struct bpf_map *, map, void *, key, u64, flags)
{
@@ -681,8 +709,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
.arg4_type = ARG_ANYTHING,
};
+struct sock_map_seq_info {
+ struct bpf_map *map;
+ struct sock *sk;
+ u32 index;
+};
+
+struct bpf_iter__sockmap {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+ __bpf_md_ptr(void *, key);
+ __bpf_md_ptr(struct sock *, sk);
+};
+
+DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
+ struct bpf_map *map, void *key,
+ struct sock *sk)
+
+static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
+{
+ if (unlikely(info->index >= info->map->max_entries))
+ return NULL;
+
+ info->sk = __sock_map_lookup_elem(info->map, info->index);
+
+ /* can't return sk directly, since that might be NULL */
+ return info;
+}
+
+static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_map_seq_stop */
+ rcu_read_lock();
+ return sock_map_seq_lookup_elem(info);
+}
+
+static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ ++*pos;
+ ++info->index;
+
+ return sock_map_seq_lookup_elem(info);
+}
+
+static int sock_map_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, !v);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (v) {
+ ctx.key = &info->index;
+ ctx.sk = info->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void sock_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_map_seq_show(seq, NULL);
+
+ /* pairs with sock_map_seq_start */
+ rcu_read_unlock();
+}
+
+static const struct seq_operations sock_map_seq_ops = {
+ .start = sock_map_seq_start,
+ .next = sock_map_seq_next,
+ .stop = sock_map_seq_stop,
+ .show = sock_map_seq_show,
+};
+
+static int sock_map_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_map_seq_info *info = priv_data;
+
+ info->map = aux->map;
+ return 0;
+}
+
+static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
+ .seq_ops = &sock_map_seq_ops,
+ .init_seq_private = sock_map_init_seq_private,
+ .seq_priv_size = sizeof(struct sock_map_seq_info),
+};
+
static int sock_map_btf_id;
const struct bpf_map_ops sock_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
.map_get_next_key = sock_map_get_next_key,
@@ -694,6 +830,7 @@ const struct bpf_map_ops sock_map_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_stab",
.map_btf_id = &sock_map_btf_id,
+ .iter_seq_info = &sock_map_iter_seq_info,
};
struct bpf_shtab_elem {
@@ -829,8 +966,9 @@ static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab,
}
}
- new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
- htab->map.numa_node);
+ new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
if (!new) {
atomic_dec(&htab->count);
return ERR_PTR(-ENOMEM);
@@ -855,8 +993,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
WARN_ON_ONCE(!rcu_read_lock_held());
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
- if (inet_csk_has_ulp(sk))
- return -EINVAL;
link = sk_psock_init_link();
if (!link)
@@ -915,45 +1051,6 @@ out_free:
return ret;
}
-static int sock_hash_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
-{
- struct socket *sock;
- struct sock *sk;
- int ret;
- u64 ufd;
-
- if (map->value_size == sizeof(u64))
- ufd = *(u64 *)value;
- else
- ufd = *(u32 *)value;
- if (ufd > S32_MAX)
- return -EINVAL;
-
- sock = sockfd_lookup(ufd, &ret);
- if (!sock)
- return ret;
- sk = sock->sk;
- if (!sk) {
- ret = -EINVAL;
- goto out;
- }
- if (!sock_map_sk_is_suitable(sk)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- sock_map_sk_acquire(sk);
- if (!sock_map_sk_state_allowed(sk))
- ret = -EOPNOTSUPP;
- else
- ret = sock_hash_update_common(map, key, sk, flags);
- sock_map_sk_release(sk);
-out:
- fput(sock->file);
- return ret;
-}
-
static int sock_hash_get_next_key(struct bpf_map *map, void *key,
void *key_next)
{
@@ -971,7 +1068,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key,
if (!elem)
goto find_first_elem;
- elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)),
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)),
struct bpf_shtab_elem, node);
if (elem_next) {
memcpy(key_next, elem_next->key, key_size);
@@ -983,7 +1080,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key,
find_first_elem:
for (; i < htab->buckets_num; i++) {
head = &sock_hash_select_bucket(htab, i)->head;
- elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)),
struct bpf_shtab_elem, node);
if (elem_next) {
memcpy(key_next, elem_next->key, key_size);
@@ -998,7 +1095,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
{
struct bpf_shtab *htab;
int i, err;
- u64 cost;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@@ -1011,7 +1107,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
if (attr->key_size > MAX_BPF_STACK)
return ERR_PTR(-E2BIG);
- htab = kzalloc(sizeof(*htab), GFP_USER);
+ htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -1026,21 +1122,10 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
goto free_htab;
}
- cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) +
- (u64) htab->elem_size * htab->map.max_entries;
- if (cost >= U32_MAX - PAGE_SIZE) {
- err = -EINVAL;
- goto free_htab;
- }
- err = bpf_map_charge_init(&htab->map.memory, cost);
- if (err)
- goto free_htab;
-
htab->buckets = bpf_map_area_alloc(htab->buckets_num *
sizeof(struct bpf_shtab_bucket),
htab->map.numa_node);
if (!htab->buckets) {
- bpf_map_charge_finish(&htab->map.memory);
err = -ENOMEM;
goto free_htab;
}
@@ -1119,7 +1204,7 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
if (!sk)
return ERR_PTR(-ENOENT);
- sock_gen_cookie(sk);
+ __sock_gen_cookie(sk);
return &sk->sk_cookie;
}
@@ -1128,7 +1213,7 @@ static void *sock_hash_lookup(struct bpf_map *map, void *key)
struct sock *sk;
sk = __sock_hash_lookup_elem(map, key);
- if (!sk || !sk_fullsock(sk))
+ if (!sk)
return NULL;
if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
return NULL;
@@ -1217,12 +1302,128 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
.arg4_type = ARG_ANYTHING,
};
+struct sock_hash_seq_info {
+ struct bpf_map *map;
+ struct bpf_shtab *htab;
+ u32 bucket_id;
+};
+
+static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
+ struct bpf_shtab_elem *prev_elem)
+{
+ const struct bpf_shtab *htab = info->htab;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
+ struct hlist_node *node;
+
+ /* try to find next elem in the same bucket */
+ if (prev_elem) {
+ node = rcu_dereference(hlist_next_rcu(&prev_elem->node));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+
+ /* no more elements, continue in the next bucket */
+ info->bucket_id++;
+ }
+
+ for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
+ bucket = &htab->buckets[info->bucket_id];
+ node = rcu_dereference(hlist_first_rcu(&bucket->head));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+ }
+
+ return NULL;
+}
+
+static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_hash_seq_stop */
+ rcu_read_lock();
+ return sock_hash_seq_find_next(info, NULL);
+}
+
+static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ ++*pos;
+ return sock_hash_seq_find_next(info, v);
+}
+
+static int sock_hash_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_shtab_elem *elem = v;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, !elem);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ ctx.key = elem->key;
+ ctx.sk = elem->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void sock_hash_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_hash_seq_show(seq, NULL);
+
+ /* pairs with sock_hash_seq_start */
+ rcu_read_unlock();
+}
+
+static const struct seq_operations sock_hash_seq_ops = {
+ .start = sock_hash_seq_start,
+ .next = sock_hash_seq_next,
+ .stop = sock_hash_seq_stop,
+ .show = sock_hash_seq_show,
+};
+
+static int sock_hash_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_hash_seq_info *info = priv_data;
+
+ info->map = aux->map;
+ info->htab = container_of(aux->map, struct bpf_shtab, map);
+ return 0;
+}
+
+static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
+ .seq_ops = &sock_hash_seq_ops,
+ .init_seq_private = sock_hash_init_seq_private,
+ .seq_priv_size = sizeof(struct sock_hash_seq_info),
+};
+
static int sock_hash_map_btf_id;
const struct bpf_map_ops sock_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = sock_hash_alloc,
.map_free = sock_hash_free,
.map_get_next_key = sock_hash_get_next_key,
- .map_update_elem = sock_hash_update_elem,
+ .map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_hash_delete_elem,
.map_lookup_elem = sock_hash_lookup,
.map_lookup_elem_sys_only = sock_hash_lookup_sys,
@@ -1230,6 +1431,7 @@ const struct bpf_map_ops sock_hash_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_shtab",
.map_btf_id = &sock_hash_map_btf_id,
+ .iter_seq_info = &sock_hash_iter_seq_info,
};
static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
@@ -1340,3 +1542,62 @@ void sock_map_close(struct sock *sk, long timeout)
release_sock(sk);
saved_close(sk, timeout);
}
+
+static int sock_map_iter_attach_target(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_map *map;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP &&
+ map->map_type != BPF_MAP_TYPE_SOCKHASH)
+ goto put_map;
+
+ if (prog->aux->max_rdonly_access > map->key_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+static struct bpf_iter_reg sock_map_iter_reg = {
+ .target = "sockmap",
+ .attach_target = sock_map_iter_attach_target,
+ .detach_target = sock_map_iter_detach_target,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__sockmap, key),
+ PTR_TO_RDONLY_BUF_OR_NULL },
+ { offsetof(struct bpf_iter__sockmap, sk),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static int __init bpf_sockmap_iter_init(void)
+{
+ sock_map_iter_reg.ctx_arg_info[1].btf_id =
+ btf_sock_ids[BTF_SOCK_TYPE_SOCK];
+ return bpf_iter_reg_target(&sock_map_iter_reg);
+}
+late_initcall(bpf_sockmap_iter_init);
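A minimal BPF iterator program for the new "sockmap" target, matching the bpf_iter__sockmap context defined above, might look like this. It is a sketch: the header names and the global counter are assumptions, and ctx->key is only dereferenced to test for the final call:

/* Assumes vmlinux.h (or an equivalent header) provides bpf_iter__sockmap. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

long elems = 0; /* read back from user space after the iteration */

SEC("iter/sockmap")
int count_elems(struct bpf_iter__sockmap *ctx)
{
        /* key and sk are NULL for the final call with no element */
        if (ctx->key && ctx->sk)
                elems++;
        return 0;
}

char LICENSE[] SEC("license") = "GPL";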
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index bbdd3c7b6cb5..b065f0a103ed 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -293,7 +293,7 @@ select_by_hash:
i = j = reciprocal_scale(hash, socks);
while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
i++;
- if (i >= reuse->num_socks)
+ if (i >= socks)
i = 0;
if (i == j)
goto out;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 6ada114bbcca..d86d8d11cfe4 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -22,7 +22,7 @@
#include <net/busy_poll.h>
#include <net/pkt_sched.h>
-static int two __maybe_unused = 2;
+static int two = 2;
static int three = 3;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
@@ -546,7 +546,7 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = &two,
},
{
.procname = "devconf_inherit_init_net",
@@ -587,6 +587,19 @@ static struct ctl_table netns_core_table[] = {
{ }
};
+static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str)
+{
+ /* fallback tunnels for initns only */
+ if (!strncmp(str, "initns", 6))
+ sysctl_fb_tunnels_only_for_init_net = 1;
+ /* no fallback tunnels anywhere */
+ else if (!strncmp(str, "none", 4))
+ sysctl_fb_tunnels_only_for_init_net = 2;
+
+ return 1;
+}
+__setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
+
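Usage note: the two recognised values mirror the strncmp() branches above, e.g. booting with

        fb_tunnels=initns       (fallback tunnels created only in the initial netns)
        fb_tunnels=none         (no fallback tunnels in any netns)

while any other string leaves the sysctl at its built-in default.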
static __net_init int sysctl_core_net_init(struct net *net)
{
struct ctl_table *tbl;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 48aba933a5a8..3a8c9ab4ecbe 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
/* Returns 0 on success, negative on failure */
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
- struct net_device *dev, u32 queue_index)
+ struct net_device *dev, u32 queue_index, unsigned int napi_id)
{
if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
WARN(1, "Driver promised not to register this");
@@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
xdp_rxq_info_init(xdp_rxq);
xdp_rxq->dev = dev;
xdp_rxq->queue_index = queue_index;
+ xdp_rxq->napi_id = napi_id;
xdp_rxq->reg_state = REG_STATE_REGISTERED;
return 0;
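With the extra argument, a driver registering its RX queue info now passes the NAPI id as well. A hedged sketch of the updated call, with illustrative ring/queue field names:

        /* during ring setup, after netif_napi_add() has assigned a napi_id */
        err = xdp_rxq_info_reg(&ring->xdp_rxq, netdev, ring->queue_index,
                               ring->napi.napi_id);
        if (err)
                goto err_free_ring;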
@@ -335,11 +336,10 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolean
* is used for those calls sites. Thus, allowing for faster recycling
- * of xdp_frames/pages in those cases. This path is never used by the
- * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of
- * the switch-statement.
+ * of xdp_frames/pages in those cases.
*/
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+ struct xdp_buff *xdp)
{
struct xdp_mem_allocator *xa;
struct page *page;
@@ -361,6 +361,10 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
page = virt_to_page(data); /* Assumes order0 page*/
put_page(page);
break;
+ case MEM_TYPE_XSK_BUFF_POOL:
+ /* NB! Only valid from an xdp_buff! */
+ xsk_buff_free(xdp);
+ break;
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
@@ -370,19 +374,73 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
void xdp_return_frame(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, false);
+ __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, true);
+ __xdp_return(xdpf->data, &xdpf->mem, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+/* XDP bulk APIs introduce a defer/flush mechanism to return
+ * pages belonging to the same xdp_mem_allocator object
+ * (identified via the mem.id field) in bulk to optimize
+ * I-cache and D-cache.
+ * The bulk queue size is set to 16 to align with how
+ * XDP_REDIRECT bulking works. The bulk is flushed when
+ * it is full or when mem.id changes.
+ * xdp_frame_bulk is usually stored/allocated on the function
+ * call-stack to avoid locking penalties.
+ */
+void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
+{
+ struct xdp_mem_allocator *xa = bq->xa;
+
+ if (unlikely(!xa || !bq->count))
+ return;
+
+ page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count);
+ /* bq->xa is not cleared, to save a lookup if mem.id is the same in the next bulk */
+ bq->count = 0;
+}
+EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
+
+/* Must be called with rcu_read_lock held */
+void xdp_return_frame_bulk(struct xdp_frame *xdpf,
+ struct xdp_frame_bulk *bq)
+{
+ struct xdp_mem_info *mem = &xdpf->mem;
+ struct xdp_mem_allocator *xa;
+
+ if (mem->type != MEM_TYPE_PAGE_POOL) {
+ __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+ return;
+ }
+
+ xa = bq->xa;
+ if (unlikely(!xa)) {
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ bq->count = 0;
+ bq->xa = xa;
+ }
+
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+
+ if (unlikely(mem->id != xa->mem.id)) {
+ xdp_flush_frame_bulk(bq);
+ bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ }
+
+ bq->q[bq->count++] = xdpf->data;
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
+
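A typical caller of the bulk API is a driver TX-completion loop. The following is a hedged sketch of the intended pattern; the completed[] array, the done count and the xdp_frame_bulk_init() initialiser are assumptions based on the same series, not part of this hunk:

        struct xdp_frame_bulk bq;
        int i;

        xdp_frame_bulk_init(&bq);           /* bq.xa = NULL, bq.count = 0 */

        rcu_read_lock();                    /* required by xdp_return_frame_bulk() */
        for (i = 0; i < done; i++)
                xdp_return_frame_bulk(completed[i], &bq);
        xdp_flush_frame_bulk(&bq);          /* release anything still queued */
        rcu_read_unlock();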
void xdp_return_buff(struct xdp_buff *xdp)
{
- __xdp_return(xdp->data, &xdp->rxq->mem, true);
+ __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
}
/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
@@ -400,18 +458,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
}
EXPORT_SYMBOL_GPL(__xdp_release_frame);
-bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
- struct netdev_bpf *bpf)
-{
- if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) {
- NL_SET_ERR_MSG(bpf->extack,
- "program loaded with different flags");
- return false;
- }
- return true;
-}
-EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok);
-
void xdp_attachment_setup(struct xdp_attachment_info *info,
struct netdev_bpf *bpf)
{
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 16014ad19406..653e3bc9c87b 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1765,6 +1765,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
fn = &reply_funcs[dcb->cmd];
if (!fn->cb)
return -EOPNOTSUPP;
+ if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
if (!tb[DCB_ATTR_IFNAME])
return -EINVAL;
@@ -1827,6 +1829,8 @@ static int dcb_app_add(const struct dcb_app *app, int ifindex)
/**
* dcb_getapp - retrieve the DCBX application user priority
+ * @dev: network interface
+ * @app: application to get user priority of
*
* On success returns a non-zero 802.1p user priority bitmap
* otherwise returns 0 as the invalid user priority bitmap to
@@ -1849,6 +1853,8 @@ EXPORT_SYMBOL(dcb_getapp);
/**
* dcb_setapp - add CEE dcb application data to app list
+ * @dev: network interface
+ * @new: application data to add
*
* Priority 0 is an invalid priority in CEE spec. This routine
* removes applications from the app list if the priority is
@@ -1890,6 +1896,8 @@ EXPORT_SYMBOL(dcb_setapp);
/**
* dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority
+ * @dev: network interface
+ * @app: where to store the retrieved application data
*
* Helper routine which on success returns a non-zero 802.1Qaz user
* priority bitmap otherwise returns 0 to indicate the dcb_app was
@@ -1912,6 +1920,8 @@ EXPORT_SYMBOL(dcb_ieee_getapp_mask);
/**
* dcb_ieee_setapp - add IEEE dcb application data to app list
+ * @dev: network interface
+ * @new: application data to add
*
* This adds Application data to the list. Multiple application
* entries may exists for the same selector and protocol as long
@@ -1946,6 +1956,8 @@ EXPORT_SYMBOL(dcb_ieee_setapp);
/**
* dcb_ieee_delapp - delete IEEE dcb application data from list
+ * @dev: network interface
+ * @del: application data to delete
*
* This removes a matching APP data from the APP list
*/
@@ -1975,7 +1987,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
}
EXPORT_SYMBOL(dcb_ieee_delapp);
-/**
+/*
* dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from
* priorities to the DSCP values assigned to that priority. Initialize p_map
* such that each map element holds a bit mask of DSCP values configured for
@@ -2004,7 +2016,7 @@ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev,
}
EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map);
-/**
+/*
* dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from
* DSCP values to the priorities assigned to that DSCP value. Initialize p_map
* such that each map element holds a bit mask of priorities configured for a
@@ -2031,7 +2043,7 @@ dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev,
}
EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map);
-/**
+/*
* Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet
* type, with valid PID values >= 1536. A special meaning is then assigned to
* protocol value of 0: "default priority. For use when priority is not
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 0a72510d5de1..c4bbac99740d 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -242,6 +242,8 @@ static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
/**
* dccp_ackvec_input - Register incoming packet in the buffer
+ * @av: Ack Vector to register packet to
+ * @skb: Packet to register
*/
void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
{
@@ -273,8 +275,11 @@ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
/**
* dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
+ * @av: Ack Vector record to clean
+ * @ackno: last Ack Vector which has been acknowledged
+ *
* This routine is called when the peer acknowledges the receipt of Ack Vectors
- * up to and including @ackno. While based on on section A.3 of RFC 4340, here
+ * up to and including @ackno. While based on section A.3 of RFC 4340, here
* are additional precautions to prevent corrupted buffer state. In particular,
* we use tail_ackno to identify outdated records; it always marks the earliest
* packet of group (2) in 11.4.2.
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 1e9bb121ba72..6beac5d348e2 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -76,7 +76,7 @@ int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
return err;
}
-static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
+static __printf(3, 4) struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
{
struct kmem_cache *slab;
va_list args;
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 3da1f77bd039..4d9823d6dced 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -181,6 +181,9 @@ MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation");
/**
* ccid2_update_used_window - Track how much of cwnd is actually used
+ * @hc: socket to update window
+ * @new_wnd: new window values to add into the filter
+ *
* This is done in addition to CWV. The sender needs to have an idea of how many
* packets may be in flight, to set the local Sequence Window value accordingly
* (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the
@@ -349,6 +352,8 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
/**
* ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * @sk: socket to perform estimator on
+ *
* This code is almost identical with TCP's tcp_rtt_estimator(), since
* - it has a higher sampling frequency (recommended by RFC 1323),
* - the RTO does not collapse into RTT due to RTTVAR going towards zero,
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index b9ee1a4a8955..ca8670f78ac6 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -79,6 +79,8 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
/**
* ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
+ * @hc: socket to have the send interval updated
+ *
* This respects the granularity of X_inst (64 * bytes/second).
*/
static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
@@ -99,6 +101,7 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
/**
* ccid3_hc_tx_update_x - Update allowed sending rate X
+ * @sk: socket to be updated
* @stamp: most recent time if available - can be left NULL.
*
* This function tracks draft rfc3448bis, check there for latest details.
@@ -151,6 +154,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
/**
* ccid3_hc_tx_update_s - Track the mean packet size `s'
+ * @hc: socket to be updated
* @len: DCCP packet payload size in bytes
*
* cf. RFC 4342, 5.3 and RFC 3448, 4.1
@@ -259,6 +263,7 @@ out:
/**
* ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
+ * @sk: socket to send packet from
* @skb: next packet candidate to send on @sk
*
* This function uses the convention of ccid_packet_dequeue_eval() and
@@ -655,6 +660,7 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
/**
* ccid3_first_li - Implements [RFC 5348, 6.3.1]
+ * @sk: socket to calculate loss interval for
*
* Determine the length of the first loss interval via inverse lookup.
* Assume that X_recv can be computed by the throughput equation
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 67abad695e66..da95319842bb 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -79,6 +79,9 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
/**
* tfrc_lh_update_i_mean - Update the `open' loss interval I_0
+ * @lh: histogram to update
+ * @skb: received socket triggering loss interval update
+ *
* For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
*/
u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index af08e2df7108..0cdda3c66fb5 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -385,6 +385,9 @@ static inline struct tfrc_rx_hist_entry *
/**
* tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
+ * @h: receive histogram
+ * @skb: packet containing timestamp.
+ *
* Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
* to compute a sample with given data - calling function should check this.
*/
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 788dd629c420..305f56804832 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -996,6 +996,8 @@ int dccp_feat_finalise_settings(struct dccp_sock *dp)
/**
* dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features
+ * @dreq: server socket to resolve
+ *
* It is the server which resolves the dependencies once the CCID has been
* fully negotiated. If no CCID has been negotiated, it uses the default CCID.
*/
@@ -1033,6 +1035,10 @@ static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
/**
* dccp_feat_prefer - Move preferred entry to the start of array
+ * @preferred_value: entry to move to start of array
+ * @array: array of preferred entries
+ * @array_len: size of the array
+ *
* Reorder the @array_len elements in @array so that @preferred_value comes
* first. Returns >0 to indicate that @preferred_value does occur in @array.
*/
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9c28c8251125..2455b0c0e486 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -427,7 +427,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL);
if (*own_req)
ireq->ireq_opt = NULL;
else
@@ -464,7 +464,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
.fl4_dport = dccp_hdr(skb)->dccph_sport,
};
- security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
@@ -495,7 +495,8 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt));
+ rcu_dereference(ireq->ireq_opt),
+ inet_sk(sk)->tos);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -537,7 +538,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
local_bh_disable();
bh_lock_sock(ctl_sk);
err = ip_build_and_send_pkt(skb, ctl_sk,
- rxiph->daddr, rxiph->saddr, NULL);
+ rxiph->daddr, rxiph->saddr, NULL,
+ inet_sk(ctl_sk)->tos);
bh_unlock_sock(ctl_sk);
if (net_xmit_eval(err) == 0) {
@@ -731,7 +733,7 @@ int dccp_invalid_packet(struct sk_buff *skb)
return 1;
}
/*
- * If P.Data Offset is too too large for packet, drop packet and return
+ * If P.Data Offset is too large for packet, drop packet and return
*/
if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) {
DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index ef4ab28cfde0..1f73603913f5 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -203,7 +203,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
fl6.flowi6_oif = ireq->ir_iif;
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = htons(ireq->ir_num);
- security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+ security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
rcu_read_lock();
@@ -279,7 +279,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
fl6.flowi6_oif = inet6_iif(rxskb);
fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
- security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6));
/* sk = NULL, but it is safe for now. RST socket required. */
dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
@@ -533,7 +533,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
dccp_done(newsk);
goto out;
}
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL);
/* Clone pktoptions received with SYN, if we own the req */
if (*own_req && ireq->pktopts) {
newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
@@ -907,7 +907,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_oif = sk->sk_bound_dev_if;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 50e6d5699bb2..b8a24734385e 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -143,6 +143,8 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
/**
* dccp_determine_ccmps - Find out about CCID-specific packet-size limits
+ * @dp: socket to find packet size limits of
+ *
* We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
* since the RX CCID is restricted to feedback packets (Acks), which are small
* in comparison with the data traffic. A value of 0 means "no current CCMPS".
@@ -236,6 +238,8 @@ static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
/**
* dccp_xmit_packet - Send data packet under control of CCID
+ * @sk: socket to send data packet on
+ *
* Transmits next-queued payload and informs CCID to account for the packet.
*/
static void dccp_xmit_packet(struct sock *sk)
@@ -296,6 +300,9 @@ static void dccp_xmit_packet(struct sock *sk)
/**
* dccp_flush_write_queue - Drain queue at end of connection
+ * @sk: socket to be drained
+ * @time_budget: time allowed to drain the queue
+ *
* Since dccp_sendmsg queues packets without waiting for them to be sent, it may
* happen that the TX queue is not empty at the end of a connection. We give the
* HC-sender CCID a grace period of up to @time_budget jiffies. If this function
@@ -367,6 +374,8 @@ void dccp_write_xmit(struct sock *sk)
/**
* dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets
+ * @sk: socket to perform retransmit on
+ *
* There are only four retransmittable packet types in DCCP:
* - Request in client-REQUEST state (sec. 8.1.1),
* - CloseReq in server-CLOSEREQ state (sec. 8.3),
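The output.c hunks above only add the @dp/@sk/@time_budget lines that kernel-doc expects for every parameter. For reference, the shape kernel-doc wants (illustrative example, not from the tree):

/**
 * my_func - One-line summary of the function
 * @sk:  socket being operated on
 * @len: number of bytes to process
 *
 * A longer description may follow the parameter block after a blank line.
 */
static int my_func(struct sock *sk, int len);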
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
index db2448c33a62..5ba204ec0aca 100644
--- a/net/dccp/qpolicy.c
+++ b/net/dccp/qpolicy.c
@@ -65,14 +65,16 @@ static bool qpolicy_prio_full(struct sock *sk)
* @push: add a new @skb to the write queue
* @full: indicates that no more packets will be admitted
* @top: peeks at whatever the queueing policy defines as its `top'
+ * @params: parameter passed to policy operation
*/
-static struct dccp_qpolicy_operations {
+struct dccp_qpolicy_operations {
void (*push) (struct sock *sk, struct sk_buff *skb);
bool (*full) (struct sock *sk);
struct sk_buff* (*top) (struct sock *sk);
__be32 params;
+};
-} qpol_table[DCCPQ_POLICY_MAX] = {
+static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = {
[DCCPQ_POLICY_SIMPLE] = {
.push = qpolicy_simple_push,
.full = qpolicy_simple_full,
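The qpolicy.c change only splits the anonymous struct-plus-table definition in two so the ops type has a name that kernel-doc can attach to; dispatch through qpol_table is unchanged. Schematically (illustrative caller, with the policy index passed in directly):

/* Pick the policy's handlers by index; qpol_table as defined above. */
static void qpolicy_dispatch_push(struct sock *sk, struct sk_buff *skb,
                                  unsigned int policy)
{
        if (!qpol_table[policy].full(sk))
                qpol_table[policy].push(sk, skb);
}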
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 0e06dfc32273..db768f223ef7 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -85,7 +85,7 @@ static void dccp_retransmit_timer(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
/*
- * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
+ * More than 4MSL (8 minutes) has passed, a RESET(aborted) was
* sent, no need to retransmit, this sock is dead.
*/
if (dccp_write_timeout(sk))
@@ -176,7 +176,6 @@ static void dccp_delack_timer(struct timer_list *t)
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
- icsk->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
sk_reset_timer(sk, &icsk->icsk_delack_timer,
jiffies + TCP_DELACK_MIN);
@@ -216,13 +215,14 @@ out:
/**
* dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
- * @data: Socket to act on
+ * @t: pointer to the tasklet associated with this handler
*
* See the comments above %ccid_dequeueing_decision for supported modes.
*/
-static void dccp_write_xmitlet(unsigned long data)
+static void dccp_write_xmitlet(struct tasklet_struct *t)
{
- struct sock *sk = (struct sock *)data;
+ struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet);
+ struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk;
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
@@ -236,16 +236,15 @@ static void dccp_write_xmitlet(unsigned long data)
static void dccp_write_xmit_timer(struct timer_list *t)
{
struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer);
- struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk;
- dccp_write_xmitlet((unsigned long)sk);
+ dccp_write_xmitlet(&dp->dccps_xmitlet);
}
void dccp_init_xmit_timers(struct sock *sk)
{
struct dccp_sock *dp = dccp_sk(sk);
- tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
+ tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet);
timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0);
inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
&dccp_keepalive_timer);
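The timer.c hunks follow the tree-wide tasklet API conversion: the handler no longer receives an opaque unsigned long but the tasklet pointer itself, and recovers its container with from_tasklet(), a container_of() wrapper. The general shape of such a conversion, sketched on a made-up structure:

#include <linux/interrupt.h>

struct foo {
        struct tasklet_struct xmitlet;
        /* ... driver state ... */
};

static void foo_xmitlet(struct tasklet_struct *t)
{
        struct foo *foo = from_tasklet(foo, t, xmitlet);        /* container_of() */

        /* act on foo */
}

static void foo_init(struct foo *foo)
{
        tasklet_setup(&foo->xmitlet, foo_xmitlet);
}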
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 15d42353f1a3..d1c50a48614b 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -658,7 +658,7 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
ifa->ifa_dev = dn_db;
if (tb[IFA_LABEL])
- nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+ nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
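nla_strscpy() is the strscpy()-style replacement for nla_strlcpy(): it always NUL-terminates and returns the number of bytes copied, or -E2BIG if the attribute payload had to be truncated, instead of the would-be source length. In dn_dev.c the return value is ignored, so the change is a straight rename; a caller that wants to detect truncation could do (illustrative):

#include <linux/errno.h>
#include <linux/if.h>
#include <net/netlink.h>

static int copy_label(char *dst, const struct nlattr *attr)
{
        ssize_t n = nla_strscpy(dst, attr, IFNAMSIZ);

        return n == -E2BIG ? -EINVAL : 0;
}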
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 4cac31d22a50..2193ae529e75 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1035,7 +1035,7 @@ source_ok:
fld.saddr = dnet_select_source(dev_out, 0,
RT_SCOPE_HOST);
if (!fld.daddr)
- goto out;
+ goto done;
}
fld.flowidn_oif = LOOPBACK_IFINDEX;
res.type = RTN_LOCAL;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 1f9b9b11008c..dfecd7b22fd7 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -56,20 +56,31 @@ config NET_DSA_TAG_BRCM_PREPEND
Broadcom switches which places the tag before the Ethernet header
(prepended).
+config NET_DSA_TAG_HELLCREEK
+ tristate "Tag driver for Hirschmann Hellcreek TSN switches"
+ help
+ Say Y or M if you want to enable support for tagging frames
+ for the Hirschmann Hellcreek TSN switches.
+
config NET_DSA_TAG_GSWIP
tristate "Tag driver for Lantiq / Intel GSWIP switches"
help
Say Y or M if you want to enable support for tagging frames for the
Lantiq / Intel GSWIP switches.
+config NET_DSA_TAG_DSA_COMMON
+ tristate
+
config NET_DSA_TAG_DSA
tristate "Tag driver for Marvell switches using DSA headers"
+ select NET_DSA_TAG_DSA_COMMON
help
Say Y or M if you want to enable support for tagging frames for the
Marvell switches which use DSA headers.
config NET_DSA_TAG_EDSA
tristate "Tag driver for Marvell switches using EtherType DSA headers"
+ select NET_DSA_TAG_DSA_COMMON
help
Say Y or M if you want to enable support for tagging frames for the
Marvell switches which use EtherType DSA headers.
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 4f47b2025ff5..0fb2b75a7ae3 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -7,9 +7,9 @@ dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o
obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o
obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o
obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o
-obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
-obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
+obj-$(CONFIG_NET_DSA_TAG_DSA_COMMON) += tag_dsa.o
obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
+obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o
obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o
obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 1ce9ba8cf545..a1b1dc8a4d87 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -201,7 +201,6 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
{
struct dsa_port *cpu_dp = dev->dsa_ptr;
struct sk_buff *nskb = NULL;
- struct pcpu_sw_netstats *s;
struct dsa_slave_priv *p;
if (unlikely(!cpu_dp)) {
@@ -225,11 +224,16 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, skb->dev);
- s = this_cpu_ptr(p->stats64);
- u64_stats_update_begin(&s->syncp);
- s->rx_packets++;
- s->rx_bytes += skb->len;
- u64_stats_update_end(&s->syncp);
+ if (unlikely(cpu_dp->ds->untag_bridge_pvid)) {
+ nskb = dsa_untag_bridge_pvid(skb);
+ if (!nskb) {
+ kfree_skb(skb);
+ return 0;
+ }
+ skb = nskb;
+ }
+
+ dev_sw_netstats_rx_add(skb->dev, skb->len);
if (dsa_skb_defer_rx_timestamp(p, skb))
return 0;
@@ -330,11 +334,7 @@ EXPORT_SYMBOL_GPL(call_dsa_notifiers);
int dsa_devlink_param_get(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
{
- struct dsa_devlink_priv *dl_priv;
- struct dsa_switch *ds;
-
- dl_priv = devlink_priv(dl);
- ds = dl_priv->ds;
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
if (!ds->ops->devlink_param_get)
return -EOPNOTSUPP;
@@ -346,11 +346,7 @@ EXPORT_SYMBOL_GPL(dsa_devlink_param_get);
int dsa_devlink_param_set(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
{
- struct dsa_devlink_priv *dl_priv;
- struct dsa_switch *ds;
-
- dl_priv = devlink_priv(dl);
- ds = dl_priv->ds;
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
if (!ds->ops->devlink_param_set)
return -EOPNOTSUPP;
@@ -412,6 +408,36 @@ void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
}
EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister);
+struct devlink_region *
+dsa_devlink_region_create(struct dsa_switch *ds,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ return devlink_region_create(ds->devlink, ops, region_max_snapshots,
+ region_size);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_create);
+
+struct devlink_region *
+dsa_devlink_port_region_create(struct dsa_switch *ds,
+ int port,
+ const struct devlink_port_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+
+ return devlink_port_region_create(&dp->devlink_port, ops,
+ region_max_snapshots,
+ region_size);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_port_region_create);
+
+void dsa_devlink_region_destroy(struct devlink_region *region)
+{
+ devlink_region_destroy(region);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_destroy);
+
struct dsa_port *dsa_port_from_netdev(struct net_device *netdev)
{
if (!netdev || !dsa_slave_dev_check(netdev))
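dsa_devlink_region_create() and friends added above are thin wrappers so that drivers never reach into ds->devlink themselves. A driver would typically call them from its setup()/teardown() ops, roughly as follows (hypothetical driver and region name; error handling trimmed):

static const struct devlink_region_ops myswitch_vtu_region_ops = {
        .name           = "vtu",                /* hypothetical region */
        .destructor     = kfree,
        /* .snapshot    = myswitch_vtu_snapshot, */
};

static int myswitch_setup(struct dsa_switch *ds)
{
        struct devlink_region *region;

        region = dsa_devlink_region_create(ds, &myswitch_vtu_region_ops, 1, 1024);
        if (IS_ERR(region))
                return PTR_ERR(region);

        /* stash region in the driver's private data for dsa_devlink_region_destroy() */
        return 0;
}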
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index c0ffc7a2b65f..a04fd637b4cd 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -21,9 +21,6 @@
static DEFINE_MUTEX(dsa2_mutex);
LIST_HEAD(dsa_tree_list);
-static const struct devlink_ops dsa_devlink_ops = {
-};
-
struct dsa_switch *dsa_switch_find(int tree_index, int sw_index)
{
struct dsa_switch_tree *dst;
@@ -254,22 +251,11 @@ static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst)
static int dsa_port_setup(struct dsa_port *dp)
{
- struct dsa_switch *ds = dp->ds;
- struct dsa_switch_tree *dst = ds->dst;
- const unsigned char *id = (const unsigned char *)&dst->index;
- const unsigned char len = sizeof(dst->index);
struct devlink_port *dlp = &dp->devlink_port;
bool dsa_port_link_registered = false;
- bool devlink_port_registered = false;
- struct devlink_port_attrs attrs = {};
- struct devlink *dl = ds->devlink;
bool dsa_port_enabled = false;
int err = 0;
- attrs.phys.port_number = dp->index;
- memcpy(attrs.switch_id.id, id, len);
- attrs.switch_id.id_len = len;
-
if (dp->setup)
return 0;
@@ -278,14 +264,6 @@ static int dsa_port_setup(struct dsa_port *dp)
dsa_port_disable(dp);
break;
case DSA_PORT_TYPE_CPU:
- memset(dlp, 0, sizeof(*dlp));
- attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU;
- devlink_port_attrs_set(dlp, &attrs);
- err = devlink_port_register(dl, dlp, dp->index);
- if (err)
- break;
- devlink_port_registered = true;
-
err = dsa_port_link_register_of(dp);
if (err)
break;
@@ -298,14 +276,6 @@ static int dsa_port_setup(struct dsa_port *dp)
break;
case DSA_PORT_TYPE_DSA:
- memset(dlp, 0, sizeof(*dlp));
- attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA;
- devlink_port_attrs_set(dlp, &attrs);
- err = devlink_port_register(dl, dlp, dp->index);
- if (err)
- break;
- devlink_port_registered = true;
-
err = dsa_port_link_register_of(dp);
if (err)
break;
@@ -318,14 +288,6 @@ static int dsa_port_setup(struct dsa_port *dp)
break;
case DSA_PORT_TYPE_USER:
- memset(dlp, 0, sizeof(*dlp));
- attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
- devlink_port_attrs_set(dlp, &attrs);
- err = devlink_port_register(dl, dlp, dp->index);
- if (err)
- break;
- devlink_port_registered = true;
-
dp->mac = of_get_mac_address(dp->dn);
err = dsa_slave_create(dp);
if (err)
@@ -339,8 +301,6 @@ static int dsa_port_setup(struct dsa_port *dp)
dsa_port_disable(dp);
if (err && dsa_port_link_registered)
dsa_port_link_unregister_of(dp);
- if (err && devlink_port_registered)
- devlink_port_unregister(dlp);
if (err)
return err;
@@ -349,6 +309,48 @@ static int dsa_port_setup(struct dsa_port *dp)
return 0;
}
+static int dsa_port_devlink_setup(struct dsa_port *dp)
+{
+ struct devlink_port *dlp = &dp->devlink_port;
+ struct dsa_switch_tree *dst = dp->ds->dst;
+ struct devlink_port_attrs attrs = {};
+ struct devlink *dl = dp->ds->devlink;
+ const unsigned char *id;
+ unsigned char len;
+ int err;
+
+ id = (const unsigned char *)&dst->index;
+ len = sizeof(dst->index);
+
+ attrs.phys.port_number = dp->index;
+ memcpy(attrs.switch_id.id, id, len);
+ attrs.switch_id.id_len = len;
+ memset(dlp, 0, sizeof(*dlp));
+
+ switch (dp->type) {
+ case DSA_PORT_TYPE_UNUSED:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED;
+ break;
+ case DSA_PORT_TYPE_CPU:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU;
+ break;
+ case DSA_PORT_TYPE_DSA:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA;
+ break;
+ case DSA_PORT_TYPE_USER:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
+ break;
+ }
+
+ devlink_port_attrs_set(dlp, &attrs);
+ err = devlink_port_register(dl, dlp, dp->index);
+
+ if (!err)
+ dp->devlink_port_setup = true;
+
+ return err;
+}
+
static void dsa_port_teardown(struct dsa_port *dp)
{
struct devlink_port *dlp = &dp->devlink_port;
@@ -356,22 +358,21 @@ static void dsa_port_teardown(struct dsa_port *dp)
if (!dp->setup)
return;
+ devlink_port_type_clear(dlp);
+
switch (dp->type) {
case DSA_PORT_TYPE_UNUSED:
break;
case DSA_PORT_TYPE_CPU:
dsa_port_disable(dp);
dsa_tag_driver_put(dp->tag_ops);
- devlink_port_unregister(dlp);
dsa_port_link_unregister_of(dp);
break;
case DSA_PORT_TYPE_DSA:
dsa_port_disable(dp);
- devlink_port_unregister(dlp);
dsa_port_link_unregister_of(dp);
break;
case DSA_PORT_TYPE_USER:
- devlink_port_unregister(dlp);
if (dp->slave) {
dsa_slave_destroy(dp->slave);
dp->slave = NULL;
@@ -382,9 +383,35 @@ static void dsa_port_teardown(struct dsa_port *dp)
dp->setup = false;
}
+static void dsa_port_devlink_teardown(struct dsa_port *dp)
+{
+ struct devlink_port *dlp = &dp->devlink_port;
+
+ if (dp->devlink_port_setup)
+ devlink_port_unregister(dlp);
+ dp->devlink_port_setup = false;
+}
+
+static int dsa_devlink_info_get(struct devlink *dl,
+ struct devlink_info_req *req,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (ds->ops->devlink_info_get)
+ return ds->ops->devlink_info_get(ds, req, extack);
+
+ return -EOPNOTSUPP;
+}
+
+static const struct devlink_ops dsa_devlink_ops = {
+ .info_get = dsa_devlink_info_get,
+};
+
static int dsa_switch_setup(struct dsa_switch *ds)
{
struct dsa_devlink_priv *dl_priv;
+ struct dsa_port *dp;
int err;
if (ds->setup)
@@ -410,9 +437,20 @@ static int dsa_switch_setup(struct dsa_switch *ds)
if (err)
goto free_devlink;
+ /* Setup devlink port instances now, so that the switch
+ * setup() can register regions etc, against the ports
+ */
+ list_for_each_entry(dp, &ds->dst->ports, list) {
+ if (dp->ds == ds) {
+ err = dsa_port_devlink_setup(dp);
+ if (err)
+ goto unregister_devlink_ports;
+ }
+ }
+
err = dsa_switch_register_notifier(ds);
if (err)
- goto unregister_devlink;
+ goto unregister_devlink_ports;
err = ds->ops->setup(ds);
if (err < 0)
@@ -424,23 +462,29 @@ static int dsa_switch_setup(struct dsa_switch *ds)
ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
if (!ds->slave_mii_bus) {
err = -ENOMEM;
- goto unregister_notifier;
+ goto teardown;
}
dsa_slave_mii_bus_init(ds);
err = mdiobus_register(ds->slave_mii_bus);
if (err < 0)
- goto unregister_notifier;
+ goto teardown;
}
ds->setup = true;
return 0;
+teardown:
+ if (ds->ops->teardown)
+ ds->ops->teardown(ds);
unregister_notifier:
dsa_switch_unregister_notifier(ds);
-unregister_devlink:
+unregister_devlink_ports:
+ list_for_each_entry(dp, &ds->dst->ports, list)
+ if (dp->ds == ds)
+ dsa_port_devlink_teardown(dp);
devlink_unregister(ds->devlink);
free_devlink:
devlink_free(ds->devlink);
@@ -451,6 +495,8 @@ free_devlink:
static void dsa_switch_teardown(struct dsa_switch *ds)
{
+ struct dsa_port *dp;
+
if (!ds->setup)
return;
@@ -463,6 +509,9 @@ static void dsa_switch_teardown(struct dsa_switch *ds)
ds->ops->teardown(ds);
if (ds->devlink) {
+ list_for_each_entry(dp, &ds->dst->ports, list)
+ if (dp->ds == ds)
+ dsa_port_devlink_teardown(dp);
devlink_unregister(ds->devlink);
devlink_free(ds->devlink);
ds->devlink = NULL;
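With the new .info_get hook, `devlink dev info` on a DSA switch ends up in ds->ops->devlink_info_get(). A minimal driver implementation might look like this (hypothetical driver; assumes the devlink_info_driver_name_put() helper of that era):

static int myswitch_devlink_info_get(struct dsa_switch *ds,
                                     struct devlink_info_req *req,
                                     struct netlink_ext_ack *extack)
{
        return devlink_info_driver_name_put(req, "myswitch");
}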
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 1653e3377cb3..7c96aae9062c 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -7,6 +7,7 @@
#ifndef __DSA_PRIV_H
#define __DSA_PRIV_H
+#include <linux/if_bridge.h>
#include <linux/phy.h>
#include <linux/netdevice.h>
#include <linux/netpoll.h>
@@ -77,8 +78,6 @@ struct dsa_slave_priv {
struct sk_buff * (*xmit)(struct sk_buff *skb,
struct net_device *dev);
- struct pcpu_sw_netstats __percpu *stats64;
-
struct gro_cells gcells;
/* DSA port data, such as switch, port index, etc. */
@@ -164,8 +163,6 @@ int dsa_port_vlan_add(struct dsa_port *dp,
struct switchdev_trans *trans);
int dsa_port_vlan_del(struct dsa_port *dp,
const struct switchdev_obj_port_vlan *vlan);
-int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags);
-int dsa_port_vid_del(struct dsa_port *dp, u16 vid);
int dsa_port_link_register_of(struct dsa_port *dp);
void dsa_port_link_unregister_of(struct dsa_port *dp);
extern const struct phylink_mac_ops dsa_port_phylink_mac_ops;
@@ -196,6 +193,65 @@ dsa_slave_to_master(const struct net_device *dev)
return dp->cpu_dp->master;
}
+/* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged
+ * frames as untagged, since the bridge will not untag them.
+ */
+static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb)
+{
+ struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+ struct net_device *br = dp->bridge_dev;
+ struct net_device *dev = skb->dev;
+ struct net_device *upper_dev;
+ u16 vid, pvid, proto;
+ int err;
+
+ if (!br || br_vlan_enabled(br))
+ return skb;
+
+ err = br_vlan_get_proto(br, &proto);
+ if (err)
+ return skb;
+
+ /* Move VLAN tag from data to hwaccel */
+ if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) {
+ skb = skb_vlan_untag(skb);
+ if (!skb)
+ return NULL;
+ }
+
+ if (!skb_vlan_tag_present(skb))
+ return skb;
+
+ vid = skb_vlan_tag_get_id(skb);
+
+ /* We already run under an RCU read-side critical section since
+ * we are called from netif_receive_skb_list_internal().
+ */
+ err = br_vlan_get_pvid_rcu(dev, &pvid);
+ if (err)
+ return skb;
+
+ if (vid != pvid)
+ return skb;
+
+ /* The sad part about attempting to untag from DSA is that we
+ * don't know, unless we check, if the skb will end up in
+ * the bridge's data path - br_allowed_ingress() - or not.
+ * For example, there might be an 8021q upper for the
+ * default_pvid of the bridge, which will steal VLAN-tagged traffic
+ * from the bridge's data path. This is a configuration that DSA
+ * supports because vlan_filtering is 0. In that case, we should
+ * definitely keep the tag, to make sure it keeps working.
+ */
+ upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid);
+ if (upper_dev)
+ return skb;
+
+ __vlan_hwaccel_clear_tag(skb);
+
+ return skb;
+}
+
/* switch.c */
int dsa_switch_register_notifier(struct dsa_switch *ds);
void dsa_switch_unregister_notifier(struct dsa_switch *ds);
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 61615ebc70e9..cb3a5cf99b25 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -259,6 +259,18 @@ static void dsa_netdev_ops_set(struct net_device *dev,
dev->dsa_ptr->netdev_ops = ops;
}
+static void dsa_master_set_promiscuity(struct net_device *dev, int inc)
+{
+ const struct dsa_device_ops *ops = dev->dsa_ptr->tag_ops;
+
+ if (!ops->promisc_on_master)
+ return;
+
+ rtnl_lock();
+ dev_set_promiscuity(dev, inc);
+ rtnl_unlock();
+}
+
static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
char *buf)
{
@@ -296,14 +308,25 @@ static struct lock_class_key dsa_master_addr_list_lock_key;
int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
{
+ int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct device_link *consumer_link;
int ret;
+ /* The DSA master must use SET_NETDEV_DEV for this to work. */
+ consumer_link = device_link_add(ds->dev, dev->dev.parent,
+ DL_FLAG_AUTOREMOVE_CONSUMER);
+ if (!consumer_link)
+ netdev_err(dev,
+ "Failed to create a device link to DSA switch %s\n",
+ dev_name(ds->dev));
+
rtnl_lock();
- ret = dev_set_mtu(dev, ETH_DATA_LEN + cpu_dp->tag_ops->overhead);
+ ret = dev_set_mtu(dev, mtu);
rtnl_unlock();
if (ret)
- netdev_warn(dev, "error %d setting MTU to include DSA overhead\n",
- ret);
+ netdev_warn(dev, "error %d setting MTU to %d to include DSA overhead\n",
+ ret, mtu);
/* If we use a tagging format that doesn't have an ethertype
* field, make sure that all packets from this point on get
@@ -314,9 +337,12 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
dev->dsa_ptr = cpu_dp;
lockdep_set_class(&dev->addr_list_lock,
&dsa_master_addr_list_lock_key);
+
+ dsa_master_set_promiscuity(dev, 1);
+
ret = dsa_master_ethtool_setup(dev);
if (ret)
- return ret;
+ goto out_err_reset_promisc;
dsa_netdev_ops_set(dev, &dsa_netdev_ops);
@@ -329,6 +355,8 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
out_err_ndo_teardown:
dsa_netdev_ops_set(dev, NULL);
dsa_master_ethtool_teardown(dev);
+out_err_reset_promisc:
+ dsa_master_set_promiscuity(dev, -1);
return ret;
}
@@ -338,6 +366,7 @@ void dsa_master_teardown(struct net_device *dev)
dsa_netdev_ops_set(dev, NULL);
dsa_master_ethtool_teardown(dev);
dsa_master_reset_mtu(dev);
+ dsa_master_set_promiscuity(dev, -1);
dev->dsa_ptr = NULL;
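dsa_master_set_promiscuity() only acts for taggers that set .promisc_on_master in their dsa_device_ops, typically taggers that place their header before the Ethernet destination address, so the master's MAC filtering would otherwise drop the frames. Opting in is a one-line flag in the tagger's ops (illustrative, hypothetical tagger):

static struct sk_buff *mytag_xmit(struct sk_buff *skb, struct net_device *dev);
static struct sk_buff *mytag_rcv(struct sk_buff *skb, struct net_device *dev,
                                 struct packet_type *pt);

static const struct dsa_device_ops mytag_netdev_ops = {
        .name                   = "mytag",              /* hypothetical tagger */
        .proto                  = DSA_TAG_PROTO_NONE,   /* placeholder protocol id */
        .xmit                   = mytag_xmit,
        .rcv                    = mytag_rcv,
        .overhead               = 8,
        .promisc_on_master      = true,  /* keep the DSA master promiscuous */
};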
diff --git a/net/dsa/port.c b/net/dsa/port.c
index e23ece229c7e..73569c9af3cc 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -193,11 +193,44 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br)
dsa_port_set_state_now(dp, BR_STATE_FORWARDING);
}
+/* Must be called under rcu_read_lock() */
static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp,
bool vlan_filtering)
{
struct dsa_switch *ds = dp->ds;
- int i;
+ int err, i;
+
+ /* VLAN awareness was off, so the question is "can we turn it on".
+ * We may have had 8021q uppers, those need to go. Make sure we don't
+ * enter an inconsistent state: deny changing the VLAN awareness state
+ * as long as we have 8021q uppers.
+ */
+ if (vlan_filtering && dsa_is_user_port(ds, dp->index)) {
+ struct net_device *upper_dev, *slave = dp->slave;
+ struct net_device *br = dp->bridge_dev;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) {
+ struct bridge_vlan_info br_info;
+ u16 vid;
+
+ if (!is_vlan_dev(upper_dev))
+ continue;
+
+ vid = vlan_dev_vlan_id(upper_dev);
+
+ /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+ * device, respectively the VID is not found, returning
+ * 0 means success, which is a failure for us here.
+ */
+ err = br_vlan_get_info(br, vid, &br_info);
+ if (err == 0) {
+ dev_err(ds->dev, "Must remove upper %s first\n",
+ upper_dev->name);
+ return false;
+ }
+ }
+ }
if (!ds->vlan_filtering_is_global)
return true;
@@ -232,28 +265,38 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
struct dsa_switch *ds = dp->ds;
int err;
- /* bridge skips -EOPNOTSUPP, so skip the prepare phase */
- if (switchdev_trans_ph_prepare(trans))
- return 0;
+ if (switchdev_trans_ph_prepare(trans)) {
+ bool apply;
- if (!ds->ops->port_vlan_filtering)
- return 0;
+ if (!ds->ops->port_vlan_filtering)
+ return -EOPNOTSUPP;
- if (!dsa_port_can_apply_vlan_filtering(dp, vlan_filtering))
- return -EINVAL;
+ /* We are called from dsa_slave_switchdev_blocking_event(),
+ * which is not under rcu_read_lock(), unlike
+ * dsa_slave_switchdev_event().
+ */
+ rcu_read_lock();
+ apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering);
+ rcu_read_unlock();
+ if (!apply)
+ return -EINVAL;
+ }
if (dsa_port_is_vlan_filtering(dp) == vlan_filtering)
return 0;
- err = ds->ops->port_vlan_filtering(ds, dp->index,
- vlan_filtering);
+ err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering,
+ trans);
if (err)
return err;
- if (ds->vlan_filtering_is_global)
- ds->vlan_filtering = vlan_filtering;
- else
- dp->vlan_filtering = vlan_filtering;
+ if (switchdev_trans_ph_commit(trans)) {
+ if (ds->vlan_filtering_is_global)
+ ds->vlan_filtering = vlan_filtering;
+ else
+ dp->vlan_filtering = vlan_filtering;
+ }
+
return 0;
}
@@ -433,39 +476,6 @@ int dsa_port_vlan_del(struct dsa_port *dp,
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
}
-int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags)
-{
- struct switchdev_obj_port_vlan vlan = {
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .flags = flags,
- .vid_begin = vid,
- .vid_end = vid,
- };
- struct switchdev_trans trans;
- int err;
-
- trans.ph_prepare = true;
- err = dsa_port_vlan_add(dp, &vlan, &trans);
- if (err)
- return err;
-
- trans.ph_prepare = false;
- return dsa_port_vlan_add(dp, &vlan, &trans);
-}
-EXPORT_SYMBOL(dsa_port_vid_add);
-
-int dsa_port_vid_del(struct dsa_port *dp, u16 vid)
-{
- struct switchdev_obj_port_vlan vlan = {
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .vid_begin = vid,
- .vid_end = vid,
- };
-
- return dsa_port_vlan_del(dp, &vlan);
-}
-EXPORT_SYMBOL(dsa_port_vid_del);
-
static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
{
struct device_node *phy_dn;
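With dsa_port_vid_add()/dsa_port_vid_del() gone, callers drive dsa_port_vlan_add() through a switchdev transaction themselves. The two-phase pattern the removed helper used to encapsulate (and which dsa_slave_vlan_rx_add_vid() now open-codes) is simply:

/* Prepare phase may veto; commit phase is expected to succeed. */
static int port_vid_add_two_phase(struct dsa_port *dp, u16 vid, u16 flags)
{
        struct switchdev_obj_port_vlan vlan = {
                .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
                .flags = flags,
                .vid_begin = vid,
                .vid_end = vid,
        };
        struct switchdev_trans trans;
        int err;

        trans.ph_prepare = true;
        err = dsa_port_vlan_add(dp, &vlan, &trans);
        if (err)
                return err;

        trans.ph_prepare = false;
        return dsa_port_vlan_add(dp, &vlan, &trans);
}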
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 16e5f98d4882..4a0498bf6c65 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -303,13 +303,36 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
return ret;
}
+/* Must be called under rcu_read_lock() */
+static int
+dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct net_device *upper_dev;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) {
+ u16 vid;
+
+ if (!is_vlan_dev(upper_dev))
+ continue;
+
+ vid = vlan_dev_vlan_id(upper_dev);
+ if (vid >= vlan->vid_begin && vid <= vlan->vid_end)
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
static int dsa_slave_vlan_add(struct net_device *dev,
const struct switchdev_obj *obj,
struct switchdev_trans *trans)
{
+ struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
struct switchdev_obj_port_vlan vlan;
- int err;
+ int vid, err;
if (obj->orig_dev != dev)
return -EOPNOTSUPP;
@@ -319,6 +342,17 @@ static int dsa_slave_vlan_add(struct net_device *dev,
vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
+ /* Deny adding a bridge VLAN when there is already an 802.1Q upper with
+ * the same VID.
+ */
+ if (trans->ph_prepare && br_vlan_enabled(dp->bridge_dev)) {
+ rcu_read_lock();
+ err = dsa_slave_vlan_check_for_8021q_uppers(dev, &vlan);
+ rcu_read_unlock();
+ if (err)
+ return err;
+ }
+
err = dsa_port_vlan_add(dp, &vlan, trans);
if (err)
return err;
@@ -333,6 +367,12 @@ static int dsa_slave_vlan_add(struct net_device *dev,
if (err)
return err;
+ for (vid = vlan.vid_begin; vid <= vlan.vid_end; vid++) {
+ err = vlan_vid_add(master, htons(ETH_P_8021Q), vid);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -376,7 +416,10 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
static int dsa_slave_vlan_del(struct net_device *dev,
const struct switchdev_obj *obj)
{
+ struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct switchdev_obj_port_vlan *vlan;
+ int vid, err;
if (obj->orig_dev != dev)
return -EOPNOTSUPP;
@@ -384,10 +427,19 @@ static int dsa_slave_vlan_del(struct net_device *dev,
if (dsa_port_skip_vlan_configuration(dp))
return 0;
+ vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
/* Do not deprogram the CPU port as it may be shared with other user
* ports which can be members of this VLAN as well.
*/
- return dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
+ err = dsa_port_vlan_del(dp, vlan);
+ if (err)
+ return err;
+
+ for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++)
+ vlan_vid_del(master, htons(ETH_P_8021Q), vid);
+
+ return 0;
}
static int dsa_slave_port_obj_del(struct net_device *dev,
@@ -470,10 +522,10 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p,
if (!clone)
return;
- DSA_SKB_CB(skb)->clone = clone;
-
- if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type))
+ if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) {
+ DSA_SKB_CB(skb)->clone = clone;
return;
+ }
kfree_skb(clone);
}
@@ -496,17 +548,36 @@ netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
}
EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
+static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
+{
+ int needed_headroom = dev->needed_headroom;
+ int needed_tailroom = dev->needed_tailroom;
+
+ /* For tail taggers, we need to pad short frames ourselves, to ensure
+ * that the tail tag does not fail at its role of being at the end of
+ * the packet, once the master interface pads the frame. Account for
+ * that pad length here, and pad later.
+ */
+ if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
+ needed_tailroom += ETH_ZLEN - skb->len;
+ /* skb_headroom() returns unsigned int... */
+ needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
+ needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
+
+ if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
+ /* No reallocation needed, yay! */
+ return 0;
+
+ return pskb_expand_head(skb, needed_headroom, needed_tailroom,
+ GFP_ATOMIC);
+}
+
static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct pcpu_sw_netstats *s;
struct sk_buff *nskb;
- s = this_cpu_ptr(p->stats64);
- u64_stats_update_begin(&s->syncp);
- s->tx_packets++;
- s->tx_bytes += skb->len;
- u64_stats_update_end(&s->syncp);
+ dev_sw_netstats_tx_add(dev, 1, skb->len);
DSA_SKB_CB(skb)->clone = NULL;
@@ -515,6 +586,17 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
*/
dsa_skb_tx_timestamp(p, skb);
+ if (dsa_realloc_skb(skb, dev)) {
+ dev_kfree_skb_any(skb);
+ return NETDEV_TX_OK;
+ }
+
+ /* needed_tailroom should still be 'warm' in the cache line from
+ * dsa_realloc_skb(), which has also ensured that padding is safe.
+ */
+ if (dev->needed_tailroom)
+ eth_skb_pad(skb);
+
/* Transmit function may have to reallocate the original SKB,
* in which case it must have freed it. Only free it here on error.
*/
@@ -627,7 +709,6 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
uint64_t *data)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = dp->ds;
struct pcpu_sw_netstats *s;
unsigned int start;
@@ -636,7 +717,7 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
for_each_possible_cpu(i) {
u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
- s = per_cpu_ptr(p->stats64, i);
+ s = per_cpu_ptr(dev->tstats, i);
do {
start = u64_stats_fetch_begin_irq(&s->syncp);
tx_packets = s->tx_packets;
@@ -1165,34 +1246,6 @@ static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type,
return ds->ops->port_setup_tc(ds, dp->index, type, type_data);
}
-static void dsa_slave_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *stats)
-{
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct pcpu_sw_netstats *s;
- unsigned int start;
- int i;
-
- netdev_stats_to_stats64(stats, &dev->stats);
- for_each_possible_cpu(i) {
- u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
-
- s = per_cpu_ptr(p->stats64, i);
- do {
- start = u64_stats_fetch_begin_irq(&s->syncp);
- tx_packets = s->tx_packets;
- tx_bytes = s->tx_bytes;
- rx_packets = s->rx_packets;
- rx_bytes = s->rx_bytes;
- } while (u64_stats_fetch_retry_irq(&s->syncp, start));
-
- stats->tx_packets += tx_packets;
- stats->tx_bytes += tx_bytes;
- stats->rx_packets += rx_packets;
- stats->rx_bytes += rx_bytes;
- }
-}
-
static int dsa_slave_get_rxnfc(struct net_device *dev,
struct ethtool_rxnfc *nfc, u32 *rule_locs)
{
@@ -1232,64 +1285,66 @@ static int dsa_slave_get_ts_info(struct net_device *dev,
static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
u16 vid)
{
+ struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct bridge_vlan_info info;
+ struct switchdev_obj_port_vlan vlan = {
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .vid_begin = vid,
+ .vid_end = vid,
+ /* This API only allows programming tagged, non-PVID VIDs */
+ .flags = 0,
+ };
+ struct switchdev_trans trans;
int ret;
- /* Check for a possible bridge VLAN entry now since there is no
- * need to emulate the switchdev prepare + commit phase.
- */
- if (dp->bridge_dev) {
- if (dsa_port_skip_vlan_configuration(dp))
- return 0;
+ /* User port... */
+ trans.ph_prepare = true;
+ ret = dsa_port_vlan_add(dp, &vlan, &trans);
+ if (ret)
+ return ret;
- /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
- * device, respectively the VID is not found, returning
- * 0 means success, which is a failure for us here.
- */
- ret = br_vlan_get_info(dp->bridge_dev, vid, &info);
- if (ret == 0)
- return -EBUSY;
- }
+ trans.ph_prepare = false;
+ ret = dsa_port_vlan_add(dp, &vlan, &trans);
+ if (ret)
+ return ret;
- ret = dsa_port_vid_add(dp, vid, 0);
+ /* And CPU port... */
+ trans.ph_prepare = true;
+ ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans);
if (ret)
return ret;
- ret = dsa_port_vid_add(dp->cpu_dp, vid, 0);
+ trans.ph_prepare = false;
+ ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans);
if (ret)
return ret;
- return 0;
+ return vlan_vid_add(master, proto, vid);
}
static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
u16 vid)
{
+ struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct bridge_vlan_info info;
- int ret;
-
- /* Check for a possible bridge VLAN entry now since there is no
- * need to emulate the switchdev prepare + commit phase.
- */
- if (dp->bridge_dev) {
- if (dsa_port_skip_vlan_configuration(dp))
- return 0;
-
- /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
- * device, respectively the VID is not found, returning
- * 0 means success, which is a failure for us here.
- */
- ret = br_vlan_get_info(dp->bridge_dev, vid, &info);
- if (ret == 0)
- return -EBUSY;
- }
+ struct switchdev_obj_port_vlan vlan = {
+ .vid_begin = vid,
+ .vid_end = vid,
+ /* This API only allows programming tagged, non-PVID VIDs */
+ .flags = 0,
+ };
+ int err;
/* Do not deprogram the CPU port as it may be shared with other user
* ports which can be members of this VLAN as well.
*/
- return dsa_port_vid_del(dp, vid);
+ err = dsa_port_vlan_del(dp, &vlan);
+ if (err)
+ return err;
+
+ vlan_vid_del(master, proto, vid);
+
+ return 0;
}
struct dsa_hw_port {
@@ -1566,7 +1621,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
#endif
.ndo_get_phys_port_name = dsa_slave_get_phys_port_name,
.ndo_setup_tc = dsa_slave_setup_tc,
- .ndo_get_stats64 = dsa_slave_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_port_parent_id = dsa_slave_get_port_parent_id,
.ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid,
@@ -1756,6 +1811,16 @@ int dsa_slave_create(struct dsa_port *port)
slave_dev->netdev_ops = &dsa_slave_netdev_ops;
if (ds->ops->port_max_mtu)
slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
+ if (cpu_dp->tag_ops->tail_tag)
+ slave_dev->needed_tailroom = cpu_dp->tag_ops->overhead;
+ else
+ slave_dev->needed_headroom = cpu_dp->tag_ops->overhead;
+ /* Try to save one extra realloc later in the TX path (in the master)
+ * by also inheriting the master's needed headroom and tailroom.
+ * The 8021q driver also does this.
+ */
+ slave_dev->needed_headroom += master->needed_headroom;
+ slave_dev->needed_tailroom += master->needed_tailroom;
SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
@@ -1766,8 +1831,8 @@ int dsa_slave_create(struct dsa_port *port)
slave_dev->vlan_features = master->vlan_features;
p = netdev_priv(slave_dev);
- p->stats64 = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!p->stats64) {
+ slave_dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!slave_dev->tstats) {
free_netdev(slave_dev);
return -ENOMEM;
}
@@ -1784,16 +1849,17 @@ int dsa_slave_create(struct dsa_port *port)
rtnl_lock();
ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN);
rtnl_unlock();
- if (ret)
- dev_warn(ds->dev, "nonfatal error %d setting MTU on port %d\n",
- ret, port->index);
+ if (ret && ret != -EOPNOTSUPP)
+ dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n",
+ ret, ETH_DATA_LEN, port->index);
netif_carrier_off(slave_dev);
ret = dsa_slave_phy_setup(slave_dev);
if (ret) {
- netdev_err(master, "error %d setting up slave PHY for %s\n",
- ret, slave_dev->name);
+ netdev_err(slave_dev,
+ "error %d setting up PHY for tree %d, switch %d, port %d\n",
+ ret, ds->dst->index, ds->index, port->index);
goto out_gcells;
}
@@ -1828,7 +1894,7 @@ out_phy:
out_gcells:
gro_cells_destroy(&p->gcells);
out_free:
- free_percpu(p->stats64);
+ free_percpu(slave_dev->tstats);
free_netdev(slave_dev);
port->slave = NULL;
return ret;
@@ -1850,7 +1916,7 @@ void dsa_slave_destroy(struct net_device *slave_dev)
dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
phylink_destroy(dp->pl);
gro_cells_destroy(&p->gcells);
- free_percpu(p->stats64);
+ free_percpu(slave_dev->tstats);
free_netdev(slave_dev);
}
@@ -1880,9 +1946,9 @@ static int dsa_slave_changeupper(struct net_device *dev,
return err;
}
-static int dsa_slave_upper_vlan_check(struct net_device *dev,
- struct netdev_notifier_changeupper_info *
- info)
+static int
+dsa_prevent_bridging_8021q_upper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
{
struct netlink_ext_ack *ext_ack;
struct net_device *slave;
@@ -1912,14 +1978,68 @@ static int dsa_slave_upper_vlan_check(struct net_device *dev,
return NOTIFY_DONE;
}
+static int
+dsa_slave_check_8021q_upper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct net_device *br = dp->bridge_dev;
+ struct bridge_vlan_info br_info;
+ struct netlink_ext_ack *extack;
+ int err = NOTIFY_DONE;
+ u16 vid;
+
+ if (!br || !br_vlan_enabled(br))
+ return NOTIFY_DONE;
+
+ extack = netdev_notifier_info_to_extack(&info->info);
+ vid = vlan_dev_vlan_id(info->upper_dev);
+
+ /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+ * device, respectively the VID is not found, returning
+ * 0 means success, which is a failure for us here.
+ */
+ err = br_vlan_get_info(br, vid, &br_info);
+ if (err == 0) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "This VLAN is already configured by the bridge");
+ return notifier_from_errno(-EBUSY);
+ }
+
+ return NOTIFY_DONE;
+}
+
static int dsa_slave_netdevice_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- if (event == NETDEV_CHANGEUPPER) {
+ switch (event) {
+ case NETDEV_PRECHANGEUPPER: {
+ struct netdev_notifier_changeupper_info *info = ptr;
+ struct dsa_switch *ds;
+ struct dsa_port *dp;
+ int err;
+
+ if (!dsa_slave_dev_check(dev))
+ return dsa_prevent_bridging_8021q_upper(dev, ptr);
+
+ dp = dsa_slave_to_port(dev);
+ ds = dp->ds;
+
+ if (ds->ops->port_prechangeupper) {
+ err = ds->ops->port_prechangeupper(ds, dp->index, info);
+ if (err)
+ return notifier_from_errno(err);
+ }
+
+ if (is_vlan_dev(info->upper_dev))
+ return dsa_slave_check_8021q_upper(dev, ptr);
+ break;
+ }
+ case NETDEV_CHANGEUPPER:
if (!dsa_slave_dev_check(dev))
- return dsa_slave_upper_vlan_check(dev, ptr);
+ return NOTIFY_DONE;
return dsa_slave_changeupper(dev, ptr);
}
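The needed_headroom/needed_tailroom handling and dsa_realloc_skb() above let tail taggers (tag appended at the end of the frame) reserve their room up front and have short frames padded before the tag is written. A tagger opts in through its dsa_device_ops (illustrative, hypothetical tagger):

static struct sk_buff *mytail_xmit(struct sk_buff *skb, struct net_device *dev);
static struct sk_buff *mytail_rcv(struct sk_buff *skb, struct net_device *dev,
                                  struct packet_type *pt);

static const struct dsa_device_ops mytail_netdev_ops = {
        .name           = "mytail",             /* hypothetical tagger */
        .proto          = DSA_TAG_PROTO_NONE,   /* placeholder protocol id */
        .xmit           = mytail_xmit,
        .rcv            = mytail_rcv,
        .overhead       = 1,                    /* one trailing tag byte */
        .tail_tag       = true,                 /* slave gets needed_tailroom, not headroom */
};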
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 86c8dc5c32a0..3fb362b6874e 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -139,8 +139,15 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
}
}
if (unset_vlan_filtering) {
- struct switchdev_trans trans = {0};
+ struct switchdev_trans trans;
+ trans.ph_prepare = true;
+ err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port),
+ false, &trans);
+ if (err && err != EOPNOTSUPP)
+ return err;
+
+ trans.ph_prepare = false;
err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port),
false, &trans);
if (err && err != EOPNOTSUPP)
@@ -232,43 +239,6 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds,
return 0;
}
-static int dsa_port_vlan_device_check(struct net_device *vlan_dev,
- int vlan_dev_vid,
- void *arg)
-{
- struct switchdev_obj_port_vlan *vlan = arg;
- u16 vid;
-
- for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) {
- if (vid == vlan_dev_vid)
- return -EBUSY;
- }
-
- return 0;
-}
-
-static int dsa_port_vlan_check(struct dsa_switch *ds, int port,
- const struct switchdev_obj_port_vlan *vlan)
-{
- const struct dsa_port *dp = dsa_to_port(ds, port);
- int err = 0;
-
- /* Device is not bridged, let it proceed with the VLAN device
- * creation.
- */
- if (!dp->bridge_dev)
- return err;
-
- /* dsa_slave_vlan_rx_{add,kill}_vid() cannot use the prepare phase and
- * already checks whether there is an overlapping bridge VLAN entry
- * with the same VID, so here we only need to check that if we are
- * adding a bridge VLAN entry there is not an overlapping VLAN device
- * claiming that VID.
- */
- return vlan_for_each(dp->slave, dsa_port_vlan_device_check,
- (void *)vlan);
-}
-
static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port,
struct dsa_notifier_vlan_info *info)
{
@@ -291,10 +261,6 @@ static int dsa_switch_vlan_prepare(struct dsa_switch *ds,
for (port = 0; port < ds->num_ports; port++) {
if (dsa_switch_vlan_match(ds, port, info)) {
- err = dsa_port_vlan_check(ds, port, info->vlan);
- if (err)
- return err;
-
err = ds->ops->port_vlan_prepare(ds, port, info->vlan);
if (err)
return err;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 780b2a15ac9b..8e3e8a5b8559 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -146,15 +146,15 @@ EXPORT_SYMBOL_GPL(vid_is_dsa_8021q);
* user explicitly configured this @vid through the bridge core, then the @vid
* is installed again, but this time with the flags from the bridge layer.
*/
-static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid,
+static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid,
u16 flags, bool enabled)
{
- struct dsa_port *dp = dsa_to_port(ds, port);
+ struct dsa_port *dp = dsa_to_port(ctx->ds, port);
if (enabled)
- return dsa_port_vid_add(dp, vid, flags);
+ return ctx->ops->vlan_add(ctx->ds, dp->index, vid, flags);
- return dsa_port_vid_del(dp, vid);
+ return ctx->ops->vlan_del(ctx->ds, dp->index, vid);
}
/* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single
@@ -209,25 +209,29 @@ static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid,
* +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+
* swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3
*/
-int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
+static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port,
+ bool enabled)
{
- int upstream = dsa_upstream_port(ds, port);
- u16 rx_vid = dsa_8021q_rx_vid(ds, port);
- u16 tx_vid = dsa_8021q_tx_vid(ds, port);
- int i, err;
+ int upstream = dsa_upstream_port(ctx->ds, port);
+ u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port);
+ u16 tx_vid = dsa_8021q_tx_vid(ctx->ds, port);
+ struct net_device *master;
+ int i, err, subvlan;
/* The CPU port is implicitly configured by
* configuring the front-panel ports
*/
- if (!dsa_is_user_port(ds, port))
+ if (!dsa_is_user_port(ctx->ds, port))
return 0;
+ master = dsa_to_port(ctx->ds, port)->cpu_dp->master;
+
/* Add this user port's RX VID to the membership list of all others
* (including itself). This is so that bridging will not be hindered.
* L2 forwarding rules still take precedence when there are no VLAN
* restrictions, so there are no concerns about leaking traffic.
*/
- for (i = 0; i < ds->num_ports; i++) {
+ for (i = 0; i < ctx->ds->num_ports; i++) {
u16 flags;
if (i == upstream)
@@ -240,9 +244,10 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
/* The RX VID is a regular VLAN on all others */
flags = BRIDGE_VLAN_INFO_UNTAGGED;
- err = dsa_8021q_vid_apply(ds, i, rx_vid, flags, enabled);
+ err = dsa_8021q_vid_apply(ctx, i, rx_vid, flags, enabled);
if (err) {
- dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n",
+ dev_err(ctx->ds->dev,
+ "Failed to apply RX VID %d to port %d: %d\n",
rx_vid, port, err);
return err;
}
@@ -251,80 +256,115 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
/* CPU port needs to see this port's RX VID
* as tagged egress.
*/
- err = dsa_8021q_vid_apply(ds, upstream, rx_vid, 0, enabled);
+ err = dsa_8021q_vid_apply(ctx, upstream, rx_vid, 0, enabled);
if (err) {
- dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n",
+ dev_err(ctx->ds->dev,
+ "Failed to apply RX VID %d to port %d: %d\n",
rx_vid, port, err);
return err;
}
+ /* Add to the master's RX filter not only @rx_vid, but in fact
+ * the entire subvlan range, just in case this DSA switch might
+ * want to use sub-VLANs.
+ */
+ for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) {
+ u16 vid = dsa_8021q_rx_vid_subvlan(ctx->ds, port, subvlan);
+
+ if (enabled)
+ vlan_vid_add(master, ctx->proto, vid);
+ else
+ vlan_vid_del(master, ctx->proto, vid);
+ }
+
/* Finally apply the TX VID on this port and on the CPU port */
- err = dsa_8021q_vid_apply(ds, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED,
+ err = dsa_8021q_vid_apply(ctx, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED,
enabled);
if (err) {
- dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n",
+ dev_err(ctx->ds->dev,
+ "Failed to apply TX VID %d on port %d: %d\n",
tx_vid, port, err);
return err;
}
- err = dsa_8021q_vid_apply(ds, upstream, tx_vid, 0, enabled);
+ err = dsa_8021q_vid_apply(ctx, upstream, tx_vid, 0, enabled);
if (err) {
- dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n",
+ dev_err(ctx->ds->dev,
+ "Failed to apply TX VID %d on port %d: %d\n",
tx_vid, upstream, err);
return err;
}
return err;
}
-EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging);
-static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port,
- struct dsa_switch *other_ds,
+int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled)
+{
+ int rc, port;
+
+ ASSERT_RTNL();
+
+ for (port = 0; port < ctx->ds->num_ports; port++) {
+ rc = dsa_8021q_setup_port(ctx, port, enabled);
+ if (rc < 0) {
+ dev_err(ctx->ds->dev,
+ "Failed to setup VLAN tagging for port %d: %d\n",
+ port, rc);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_setup);
+
+static int dsa_8021q_crosschip_link_apply(struct dsa_8021q_context *ctx,
+ int port,
+ struct dsa_8021q_context *other_ctx,
int other_port, bool enabled)
{
- u16 rx_vid = dsa_8021q_rx_vid(ds, port);
+ u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port);
/* @rx_vid of local @ds port @port goes to @other_port of
* @other_ds
*/
- return dsa_8021q_vid_apply(other_ds, other_port, rx_vid,
+ return dsa_8021q_vid_apply(other_ctx, other_port, rx_vid,
BRIDGE_VLAN_INFO_UNTAGGED, enabled);
}
-static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port,
- struct dsa_switch *other_ds,
- int other_port,
- struct list_head *crosschip_links)
+static int dsa_8021q_crosschip_link_add(struct dsa_8021q_context *ctx, int port,
+ struct dsa_8021q_context *other_ctx,
+ int other_port)
{
struct dsa_8021q_crosschip_link *c;
- list_for_each_entry(c, crosschip_links, list) {
- if (c->port == port && c->other_ds == other_ds &&
+ list_for_each_entry(c, &ctx->crosschip_links, list) {
+ if (c->port == port && c->other_ctx == other_ctx &&
c->other_port == other_port) {
refcount_inc(&c->refcount);
return 0;
}
}
- dev_dbg(ds->dev, "adding crosschip link from port %d to %s port %d\n",
- port, dev_name(other_ds->dev), other_port);
+ dev_dbg(ctx->ds->dev,
+ "adding crosschip link from port %d to %s port %d\n",
+ port, dev_name(other_ctx->ds->dev), other_port);
c = kzalloc(sizeof(*c), GFP_KERNEL);
if (!c)
return -ENOMEM;
c->port = port;
- c->other_ds = other_ds;
+ c->other_ctx = other_ctx;
c->other_port = other_port;
refcount_set(&c->refcount, 1);
- list_add(&c->list, crosschip_links);
+ list_add(&c->list, &ctx->crosschip_links);
return 0;
}
-static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds,
+static void dsa_8021q_crosschip_link_del(struct dsa_8021q_context *ctx,
struct dsa_8021q_crosschip_link *c,
- struct list_head *crosschip_links,
bool *keep)
{
*keep = !refcount_dec_and_test(&c->refcount);
@@ -332,9 +372,9 @@ static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds,
if (*keep)
return;
- dev_dbg(ds->dev,
+ dev_dbg(ctx->ds->dev,
"deleting crosschip link from port %d to %s port %d\n",
- c->port, dev_name(c->other_ds->dev), c->other_port);
+ c->port, dev_name(c->other_ctx->ds->dev), c->other_port);
list_del(&c->list);
kfree(c);
@@ -347,64 +387,58 @@ static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds,
* or untagged: it doesn't matter, since it should never egress a frame having
* our @rx_vid.
*/
-int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port,
- struct dsa_switch *other_ds,
- int other_port,
- struct list_head *crosschip_links)
+int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port,
+ struct dsa_8021q_context *other_ctx,
+ int other_port)
{
/* @other_upstream is how @other_ds reaches us. If we are part
* of disjoint trees, then we are probably connected through
* our CPU ports. If we're part of the same tree though, we should
* probably use dsa_towards_port.
*/
- int other_upstream = dsa_upstream_port(other_ds, other_port);
+ int other_upstream = dsa_upstream_port(other_ctx->ds, other_port);
int rc;
- rc = dsa_8021q_crosschip_link_add(ds, port, other_ds,
- other_port, crosschip_links);
+ rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_port);
if (rc)
return rc;
- rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds,
+ rc = dsa_8021q_crosschip_link_apply(ctx, port, other_ctx,
other_port, true);
if (rc)
return rc;
- rc = dsa_8021q_crosschip_link_add(ds, port, other_ds,
- other_upstream,
- crosschip_links);
+ rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_upstream);
if (rc)
return rc;
- return dsa_8021q_crosschip_link_apply(ds, port, other_ds,
+ return dsa_8021q_crosschip_link_apply(ctx, port, other_ctx,
other_upstream, true);
}
EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join);
-int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port,
- struct dsa_switch *other_ds,
- int other_port,
- struct list_head *crosschip_links)
+int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port,
+ struct dsa_8021q_context *other_ctx,
+ int other_port)
{
- int other_upstream = dsa_upstream_port(other_ds, other_port);
+ int other_upstream = dsa_upstream_port(other_ctx->ds, other_port);
struct dsa_8021q_crosschip_link *c, *n;
- list_for_each_entry_safe(c, n, crosschip_links, list) {
- if (c->port == port && c->other_ds == other_ds &&
+ list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) {
+ if (c->port == port && c->other_ctx == other_ctx &&
(c->other_port == other_port ||
c->other_port == other_upstream)) {
- struct dsa_switch *other_ds = c->other_ds;
+ struct dsa_8021q_context *other_ctx = c->other_ctx;
int other_port = c->other_port;
bool keep;
int rc;
- dsa_8021q_crosschip_link_del(ds, c, crosschip_links,
- &keep);
+ dsa_8021q_crosschip_link_del(ctx, c, &keep);
if (keep)
continue;
- rc = dsa_8021q_crosschip_link_apply(ds, port,
- other_ds,
+ rc = dsa_8021q_crosschip_link_apply(ctx, port,
+ other_ctx,
other_port,
false);
if (rc)
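tag_8021q now operates on a dsa_8021q_context owned by the driver; the context bundles the dsa_switch, the VLAN protocol used on the master's RX filter, the crosschip link list and the vlan_add/vlan_del callbacks with the signatures seen above. A driver would wire it up roughly like this (illustrative sketch; the ops structure name and the helper names are assumptions, only the fields used in this file are relied upon):

static int myswitch_dsa_8021q_vlan_add(struct dsa_switch *ds, int port, u16 vid, u16 flags);
static int myswitch_dsa_8021q_vlan_del(struct dsa_switch *ds, int port, u16 vid);

static const struct dsa_8021q_ops myswitch_dsa_8021q_ops = {
        .vlan_add       = myswitch_dsa_8021q_vlan_add,  /* (ds, port, vid, flags) */
        .vlan_del       = myswitch_dsa_8021q_vlan_del,  /* (ds, port, vid) */
};

static int myswitch_setup_8021q_tagging(struct dsa_switch *ds, bool enabled)
{
        struct dsa_8021q_context *ctx;
        int err;

        ctx = devm_kzalloc(ds->dev, sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        ctx->ops = &myswitch_dsa_8021q_ops;
        ctx->proto = htons(ETH_P_8021Q);
        ctx->ds = ds;
        INIT_LIST_HEAD(&ctx->crosschip_links);

        rtnl_lock();
        err = dsa_8021q_setup(ctx, enabled);    /* programs RX/TX VIDs on all user ports */
        rtnl_unlock();

        return err;
}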
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
index 55b00694cdba..002cf7f952e2 100644
--- a/net/dsa/tag_ar9331.c
+++ b/net/dsa/tag_ar9331.c
@@ -31,9 +31,6 @@ static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb,
__le16 *phdr;
u16 hdr;
- if (skb_cow_head(skb, AR9331_HDR_LEN) < 0)
- return NULL;
-
phdr = skb_push(skb, AR9331_HDR_LEN);
hdr = FIELD_PREP(AR9331_HDR_VERSION_MASK, AR9331_HDR_VERSION);
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index cc8512b5f9e2..e934dace3922 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -66,9 +66,6 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
u16 queue = skb_get_queue_mapping(skb);
u8 *brcm_tag;
- if (skb_cow_head(skb, BRCM_TAG_LEN) < 0)
- return NULL;
-
/* The Ethernet switch we are interfaced with needs packets to be at
* least 64 bytes (including FCS) otherwise they will be discarded when
* they enter the switch port logic. When Broadcom tags are enabled, we
@@ -107,6 +104,18 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
return skb;
}
+/* Frames with this tag have one of these two layouts:
+ * -----------------------------------
+ * | MAC DA | MAC SA | 4b tag | Type | DSA_TAG_PROTO_BRCM
+ * -----------------------------------
+ * -----------------------------------
+ * | 4b tag | MAC DA | MAC SA | Type | DSA_TAG_PROTO_BRCM_PREPEND
+ * -----------------------------------
+ * In both cases, at receive time, skb->data points 2 bytes before the actual
+ * Ethernet type field and we have an offset of 4 bytes between where skb->data
+ * and where the payload starts. So the same low-level receive function can be
+ * used.
+ */
static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
struct net_device *dev,
struct packet_type *pt,
@@ -144,27 +153,6 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
return skb;
}
-
-static int brcm_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto,
- int *offset)
-{
- /* We have been called on the DSA master network device after
- * eth_type_trans() which pulled the Ethernet header already.
- * Frames have one of these two layouts:
- * -----------------------------------
- * | MAC DA | MAC SA | 4b tag | Type | DSA_TAG_PROTO_BRCM
- * -----------------------------------
- * -----------------------------------
- * | 4b tag | MAC DA | MAC SA | Type | DSA_TAG_PROTO_BRCM_PREPEND
- * -----------------------------------
- * skb->data points 2 bytes before the actual Ethernet type field and
- * we have an offset of 4bytes between where skb->data and where the
- * payload starts.
- */
- *offset = BRCM_TAG_LEN;
- *proto = ((__be16 *)skb->data)[1];
- return 0;
-}
#endif
#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM)
@@ -200,7 +188,6 @@ static const struct dsa_device_ops brcm_netdev_ops = {
.xmit = brcm_tag_xmit,
.rcv = brcm_tag_rcv,
.overhead = BRCM_TAG_LEN,
- .flow_dissect = brcm_tag_flow_dissect,
};
DSA_TAG_DRIVER(brcm_netdev_ops);
@@ -229,7 +216,6 @@ static const struct dsa_device_ops brcm_prepend_netdev_ops = {
.xmit = brcm_tag_xmit_prepend,
.rcv = brcm_tag_rcv_prepend,
.overhead = BRCM_TAG_LEN,
- .flow_dissect = brcm_tag_flow_dissect,
};
DSA_TAG_DRIVER(brcm_prepend_netdev_ops);
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 7ddec9794477..112c7c6dd568 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -1,7 +1,48 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging
+ * Regular and Ethertype DSA tagging
* Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * Regular DSA
+ * -----------
+ *
+ * For untagged (in 802.1Q terms) packets, the switch will splice in
+ * the tag between the SA and the ethertype of the original
+ * packet. Tagged frames will instead have their outermost .1Q tag
+ * converted to a DSA tag. It expects the same layout when receiving
+ * packets from the CPU.
+ *
+ * Example:
+ *
+ * .----.----.----.---------
+ * Pu: | DA | SA | ET | Payload ...
+ * '----'----'----'---------
+ * 6 6 2 N
+ * .----.----.--------.-----.----.---------
+ * Pt: | DA | SA | 0x8100 | TCI | ET | Payload ...
+ * '----'----'--------'-----'----'---------
+ * 6 6 2 2 2 N
+ * .----.----.-----.----.---------
+ * Pd: | DA | SA | DSA | ET | Payload ...
+ * '----'----'-----'----'---------
+ * 6 6 4 2 N
+ *
+ * No matter if a packet is received untagged (Pu) or tagged (Pt),
+ * they will both have the same layout (Pd) when they are sent to the
+ * CPU. This is done by ignoring 802.3, replacing the ethertype field
+ * with more metadata, among which is a bit to signal if the original
+ * packet was tagged or not.
+ *
+ * Ethertype DSA
+ * -------------
+ * Uses the exact same tag format as regular DSA, but also includes a
+ * proper ethertype field (which the mv88e6xxx driver sets to
+ * ETH_P_EDSA/0xdada) followed by two zero bytes:
+ *
+ * .----.----.--------.--------.-----.----.---------
+ * | DA | SA | 0xdada | 0x0000 | DSA | ET | Payload ...
+ * '----'----'--------'--------'-----'----'---------
+ * 6 6 2 2 4 2 N
*/
#include <linux/etherdevice.h>
@@ -12,46 +53,104 @@
#define DSA_HLEN 4
-static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
+/**
+ * enum dsa_cmd - DSA Command
+ * @DSA_CMD_TO_CPU: Set on packets that were trapped or mirrored to
+ * the CPU port. This is needed to implement control protocols,
+ * e.g. STP and LLDP, that must not allow those control packets to
+ * be switched according to the normal rules.
+ * @DSA_CMD_FROM_CPU: Used by the CPU to send a packet to a specific
+ * port, ignoring all the barriers that the switch normally
+ * enforces (VLANs, STP port states etc.). No source address
+ * learning takes place. "sudo send packet"
+ * @DSA_CMD_TO_SNIFFER: Set on the copies of packets that matched some
+ * user configured ingress or egress monitor criteria. These are
+ * forwarded by the switch tree to the user configured ingress or
+ * egress monitor port, which can be set to the CPU port or a
+ * regular port. If the destination is a regular port, the tag
+ * will be removed before egressing the port. If the destination
+ * is the CPU port, the tag will not be removed.
+ * @DSA_CMD_FORWARD: This tag is used on all bulk traffic passing
+ * through the switch tree, including the flows that are directed
+ * towards the CPU. Its device/port tuple encodes the original
+ * source port on which the packet ingressed. It can also be used
+ * on transmit by the CPU to defer the forwarding decision to the
+ * hardware, based on the current config of PVT/VTU/ATU
+ * etc. Source address learning takes place if enabled on the
+ * receiving DSA/CPU port.
+ */
+enum dsa_cmd {
+ DSA_CMD_TO_CPU = 0,
+ DSA_CMD_FROM_CPU = 1,
+ DSA_CMD_TO_SNIFFER = 2,
+ DSA_CMD_FORWARD = 3
+};
+
+/**
+ * enum dsa_code - TO_CPU Code
+ *
+ * @DSA_CODE_MGMT_TRAP: DA was classified as a management
+ * address. Typical examples include STP BPDUs and LLDP.
+ * @DSA_CODE_FRAME2REG: Response to a "remote management" request.
+ * @DSA_CODE_IGMP_MLD_TRAP: IGMP/MLD signaling.
+ * @DSA_CODE_POLICY_TRAP: Frame matched some policy configuration on
+ * the device. Typical examples are matching on DA/SA/VID and DHCP
+ * snooping.
+ * @DSA_CODE_ARP_MIRROR: The name says it all really.
+ * @DSA_CODE_POLICY_MIRROR: Same as @DSA_CODE_POLICY_TRAP, but the
+ * particular policy was set to trigger a mirror instead of a
+ * trap.
+ * @DSA_CODE_RESERVED_6: Unused on all devices up to at least 6393X.
+ * @DSA_CODE_RESERVED_7: Unused on all devices up to at least 6393X.
+ *
+ * A 3-bit code is used to relay why a particular frame was sent to
+ * the CPU. We only use this to determine if the packet was mirrored
+ * or trapped, i.e. whether the packet has been forwarded by hardware
+ * or not.
+ *
+ * This is the superset of all possible codes. Any particular device
+ * may only implement a subset.
+ */
+enum dsa_code {
+ DSA_CODE_MGMT_TRAP = 0,
+ DSA_CODE_FRAME2REG = 1,
+ DSA_CODE_IGMP_MLD_TRAP = 2,
+ DSA_CODE_POLICY_TRAP = 3,
+ DSA_CODE_ARP_MIRROR = 4,
+ DSA_CODE_POLICY_MIRROR = 5,
+ DSA_CODE_RESERVED_6 = 6,
+ DSA_CODE_RESERVED_7 = 7
+};
+
+static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev,
+ u8 extra)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
u8 *dsa_header;
- /*
- * Convert the outermost 802.1q tag to a DSA tag for tagged
- * packets, or insert a DSA tag between the addresses and
- * the ethertype field for untagged packets.
- */
if (skb->protocol == htons(ETH_P_8021Q)) {
- if (skb_cow_head(skb, 0) < 0)
- return NULL;
+ if (extra) {
+ skb_push(skb, extra);
+ memmove(skb->data, skb->data + extra, 2 * ETH_ALEN);
+ }
- /*
- * Construct tagged FROM_CPU DSA tag from 802.1q tag.
- */
- dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x60 | dp->ds->index;
+ /* Construct tagged FROM_CPU DSA tag from 802.1Q tag. */
+ dsa_header = skb->data + 2 * ETH_ALEN + extra;
+ dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | 0x20 | dp->ds->index;
dsa_header[1] = dp->index << 3;
- /*
- * Move CFI field from byte 2 to byte 1.
- */
+ /* Move CFI field from byte 2 to byte 1. */
if (dsa_header[2] & 0x10) {
dsa_header[1] |= 0x01;
dsa_header[2] &= ~0x10;
}
} else {
- if (skb_cow_head(skb, DSA_HLEN) < 0)
- return NULL;
- skb_push(skb, DSA_HLEN);
+ skb_push(skb, DSA_HLEN + extra);
+ memmove(skb->data, skb->data + DSA_HLEN + extra, 2 * ETH_ALEN);
- memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
-
- /*
- * Construct untagged FROM_CPU DSA tag.
- */
- dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x40 | dp->ds->index;
+ /* Construct untagged FROM_CPU DSA tag. */
+ dsa_header = skb->data + 2 * ETH_ALEN + extra;
+ dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | dp->ds->index;
dsa_header[1] = dp->index << 3;
dsa_header[2] = 0x00;
dsa_header[3] = 0x00;
@@ -60,30 +159,60 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
return skb;
}
-static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
+ u8 extra)
{
+ int source_device, source_port;
+ enum dsa_code code;
+ enum dsa_cmd cmd;
u8 *dsa_header;
- int source_device;
- int source_port;
- if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
- return NULL;
-
- /*
- * The ethertype field is part of the DSA header.
- */
+ /* The ethertype field is part of the DSA header. */
dsa_header = skb->data - 2;
- /*
- * Check that frame type is either TO_CPU or FORWARD.
- */
- if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0)
+ cmd = dsa_header[0] >> 6;
+ switch (cmd) {
+ case DSA_CMD_FORWARD:
+ skb->offload_fwd_mark = 1;
+ break;
+
+ case DSA_CMD_TO_CPU:
+ code = (dsa_header[1] & 0x6) | ((dsa_header[2] >> 4) & 1);
+
+ switch (code) {
+ case DSA_CODE_FRAME2REG:
+ /* Remote management is not implemented yet,
+ * drop.
+ */
+ return NULL;
+ case DSA_CODE_ARP_MIRROR:
+ case DSA_CODE_POLICY_MIRROR:
+ /* Mark mirrored packets to notify any upper
+ * device (like a bridge) that forwarding has
+ * already been done by hardware.
+ */
+ skb->offload_fwd_mark = 1;
+ break;
+ case DSA_CODE_MGMT_TRAP:
+ case DSA_CODE_IGMP_MLD_TRAP:
+ case DSA_CODE_POLICY_TRAP:
+ /* Traps have, by definition, not been
+ * forwarded by hardware, so don't mark them.
+ */
+ break;
+ default:
+ /* Reserved code, this could be anything. Drop
+ * seems like the safest option.
+ */
+ return NULL;
+ }
+
+ break;
+
+ default:
return NULL;
+ }
- /*
- * Determine source device and port.
- */
source_device = dsa_header[0] & 0x1f;
source_port = (dsa_header[1] >> 3) & 0x1f;
@@ -91,16 +220,15 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
if (!skb->dev)
return NULL;
- /*
- * Convert the DSA header to an 802.1q header if the 'tagged'
- * bit in the DSA header is set. If the 'tagged' bit is clear,
- * delete the DSA header entirely.
+ /* If the 'tagged' bit is set, convert the DSA tag to an 802.1Q
+ * tag, and delete the ethertype (extra) if applicable. If the
+ * 'tagged' bit is cleared, delete the DSA tag, and the ethertype
+ * if applicable.
*/
if (dsa_header[0] & 0x20) {
u8 new_header[4];
- /*
- * Insert 802.1q ethertype and copy the VLAN-related
+ /* Insert 802.1Q ethertype and copy the VLAN-related
* fields, but clear the bit that will hold CFI (since
* DSA uses that bit location for another purpose).
*/
@@ -109,16 +237,13 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
new_header[2] = dsa_header[2] & ~0x10;
new_header[3] = dsa_header[3];
- /*
- * Move CFI bit from its place in the DSA header to
- * its 802.1q-designated place.
+ /* Move CFI bit from its place in the DSA header to
+ * its 802.1Q-designated place.
*/
if (dsa_header[1] & 0x01)
new_header[2] |= 0x10;
- /*
- * Update packet checksum if skb is CHECKSUM_COMPLETE.
- */
+ /* Update packet checksum if skb is CHECKSUM_COMPLETE. */
if (skb->ip_summed == CHECKSUM_COMPLETE) {
__wsum c = skb->csum;
c = csum_add(c, csum_partial(new_header + 2, 2, 0));
@@ -127,39 +252,101 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
}
memcpy(dsa_header, new_header, DSA_HLEN);
+
+ if (extra)
+ memmove(skb->data - ETH_HLEN,
+ skb->data - ETH_HLEN - extra,
+ 2 * ETH_ALEN);
} else {
- /*
- * Remove DSA tag and update checksum.
- */
skb_pull_rcsum(skb, DSA_HLEN);
memmove(skb->data - ETH_HLEN,
- skb->data - ETH_HLEN - DSA_HLEN,
+ skb->data - ETH_HLEN - DSA_HLEN - extra,
2 * ETH_ALEN);
}
- skb->offload_fwd_mark = 1;
-
return skb;
}
-static int dsa_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto,
- int *offset)
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA)
+
+static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
{
- *offset = 4;
- *proto = ((__be16 *)skb->data)[1];
- return 0;
+ return dsa_xmit_ll(skb, dev, 0);
+}
+
+static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt)
+{
+ if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
+ return NULL;
+
+ return dsa_rcv_ll(skb, dev, 0);
}
static const struct dsa_device_ops dsa_netdev_ops = {
- .name = "dsa",
- .proto = DSA_TAG_PROTO_DSA,
- .xmit = dsa_xmit,
- .rcv = dsa_rcv,
- .flow_dissect = dsa_tag_flow_dissect,
+ .name = "dsa",
+ .proto = DSA_TAG_PROTO_DSA,
+ .xmit = dsa_xmit,
+ .rcv = dsa_rcv,
.overhead = DSA_HLEN,
};
-MODULE_LICENSE("GPL");
+DSA_TAG_DRIVER(dsa_netdev_ops);
MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA);
+#endif /* CONFIG_NET_DSA_TAG_DSA */
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA)
+
+#define EDSA_HLEN 8
+
+static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ u8 *edsa_header;
+
+ skb = dsa_xmit_ll(skb, dev, EDSA_HLEN - DSA_HLEN);
+ if (!skb)
+ return NULL;
-module_dsa_tag_driver(dsa_netdev_ops);
+ edsa_header = skb->data + 2 * ETH_ALEN;
+ edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
+ edsa_header[1] = ETH_P_EDSA & 0xff;
+ edsa_header[2] = 0x00;
+ edsa_header[3] = 0x00;
+ return skb;
+}
+
+static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt)
+{
+ if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
+ return NULL;
+
+ skb_pull_rcsum(skb, EDSA_HLEN - DSA_HLEN);
+
+ return dsa_rcv_ll(skb, dev, EDSA_HLEN - DSA_HLEN);
+}
+
+static const struct dsa_device_ops edsa_netdev_ops = {
+ .name = "edsa",
+ .proto = DSA_TAG_PROTO_EDSA,
+ .xmit = edsa_xmit,
+ .rcv = edsa_rcv,
+ .overhead = EDSA_HLEN,
+};
+
+DSA_TAG_DRIVER(edsa_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA);
+#endif /* CONFIG_NET_DSA_TAG_EDSA */
+
+static struct dsa_tag_driver *dsa_tag_drivers[] = {
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA)
+ &DSA_TAG_DRIVER_NAME(dsa_netdev_ops),
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA)
+ &DSA_TAG_DRIVER_NAME(edsa_netdev_ops),
+#endif
+};
+
+module_dsa_tag_drivers(dsa_tag_drivers);
+
+MODULE_LICENSE("GPL");
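As a quick illustration of the bit layout that dsa_xmit_ll() builds and dsa_rcv_ll() decodes in the unified tagger above, here is a standalone sketch (userspace C, illustrative values only): byte 0 carries the 2-bit command, the 'tagged' flag (0x20) and the device index, byte 1 carries the port, and for TO_CPU frames the 3-bit reason code is split across bytes 1 and 2, mirroring the decode expression used in dsa_rcv_ll().

    #include <stdint.h>
    #include <stdio.h>

    enum dsa_cmd {
            DSA_CMD_TO_CPU     = 0,
            DSA_CMD_FROM_CPU   = 1,
            DSA_CMD_TO_SNIFFER = 2,
            DSA_CMD_FORWARD    = 3
    };

    int main(void)
    {
            uint8_t dsa[4] = { 0 };
            unsigned int sw_index = 2, port = 5;    /* example device/port */

            /* Untagged FROM_CPU tag, as composed by dsa_xmit_ll() */
            dsa[0] = (DSA_CMD_FROM_CPU << 6) | sw_index;
            dsa[1] = port << 3;

            printf("cmd=%u dev=%u port=%u tagged=%u\n",
                   dsa[0] >> 6,              /* command            */
                   dsa[0] & 0x1f,            /* device index       */
                   (dsa[1] >> 3) & 0x1f,     /* port number        */
                   !!(dsa[0] & 0x20));       /* 802.1Q tag present */

            /* TO_CPU reason code round trip: bits 2:1 live in byte 1,
             * bit 0 in bit 4 of byte 2, as recovered by dsa_rcv_ll().
             */
            uint8_t to_cpu[4] = { 0 };
            unsigned int want = 4;                  /* DSA_CODE_ARP_MIRROR */

            to_cpu[0] = DSA_CMD_TO_CPU << 6;
            to_cpu[1] = want & 0x6;
            to_cpu[2] = (want & 0x1) << 4;

            uint8_t code = (to_cpu[1] & 0x6) | ((to_cpu[2] >> 4) & 1);
            printf("to_cpu code=%u\n", code);       /* 4 */
            return 0;
    }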
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
deleted file mode 100644
index d6200ff98200..000000000000
--- a/net/dsa/tag_edsa.c
+++ /dev/null
@@ -1,215 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * net/dsa/tag_edsa.c - Ethertype DSA tagging
- * Copyright (c) 2008-2009 Marvell Semiconductor
- */
-
-#include <linux/etherdevice.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-
-#include "dsa_priv.h"
-
-#define DSA_HLEN 4
-#define EDSA_HLEN 8
-
-#define FRAME_TYPE_TO_CPU 0x00
-#define FRAME_TYPE_FORWARD 0x03
-
-#define TO_CPU_CODE_MGMT_TRAP 0x00
-#define TO_CPU_CODE_FRAME2REG 0x01
-#define TO_CPU_CODE_IGMP_MLD_TRAP 0x02
-#define TO_CPU_CODE_POLICY_TRAP 0x03
-#define TO_CPU_CODE_ARP_MIRROR 0x04
-#define TO_CPU_CODE_POLICY_MIRROR 0x05
-
-static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- u8 *edsa_header;
-
- /*
- * Convert the outermost 802.1q tag to a DSA tag and prepend
- * a DSA ethertype field is the packet is tagged, or insert
- * a DSA ethertype plus DSA tag between the addresses and the
- * current ethertype field if the packet is untagged.
- */
- if (skb->protocol == htons(ETH_P_8021Q)) {
- if (skb_cow_head(skb, DSA_HLEN) < 0)
- return NULL;
- skb_push(skb, DSA_HLEN);
-
- memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
-
- /*
- * Construct tagged FROM_CPU DSA tag from 802.1q tag.
- */
- edsa_header = skb->data + 2 * ETH_ALEN;
- edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
- edsa_header[1] = ETH_P_EDSA & 0xff;
- edsa_header[2] = 0x00;
- edsa_header[3] = 0x00;
- edsa_header[4] = 0x60 | dp->ds->index;
- edsa_header[5] = dp->index << 3;
-
- /*
- * Move CFI field from byte 6 to byte 5.
- */
- if (edsa_header[6] & 0x10) {
- edsa_header[5] |= 0x01;
- edsa_header[6] &= ~0x10;
- }
- } else {
- if (skb_cow_head(skb, EDSA_HLEN) < 0)
- return NULL;
- skb_push(skb, EDSA_HLEN);
-
- memmove(skb->data, skb->data + EDSA_HLEN, 2 * ETH_ALEN);
-
- /*
- * Construct untagged FROM_CPU DSA tag.
- */
- edsa_header = skb->data + 2 * ETH_ALEN;
- edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
- edsa_header[1] = ETH_P_EDSA & 0xff;
- edsa_header[2] = 0x00;
- edsa_header[3] = 0x00;
- edsa_header[4] = 0x40 | dp->ds->index;
- edsa_header[5] = dp->index << 3;
- edsa_header[6] = 0x00;
- edsa_header[7] = 0x00;
- }
-
- return skb;
-}
-
-static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
-{
- u8 *edsa_header;
- int frame_type;
- int code;
- int source_device;
- int source_port;
-
- if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
- return NULL;
-
- /*
- * Skip the two null bytes after the ethertype.
- */
- edsa_header = skb->data + 2;
-
- /*
- * Check that frame type is either TO_CPU or FORWARD.
- */
- frame_type = edsa_header[0] >> 6;
-
- switch (frame_type) {
- case FRAME_TYPE_TO_CPU:
- code = (edsa_header[1] & 0x6) | ((edsa_header[2] >> 4) & 1);
-
- /*
- * Mark the frame to never egress on any port of the same switch
- * unless it's a trapped IGMP/MLD packet, in which case the
- * bridge might want to forward it.
- */
- if (code != TO_CPU_CODE_IGMP_MLD_TRAP)
- skb->offload_fwd_mark = 1;
-
- break;
-
- case FRAME_TYPE_FORWARD:
- skb->offload_fwd_mark = 1;
- break;
-
- default:
- return NULL;
- }
-
- /*
- * Determine source device and port.
- */
- source_device = edsa_header[0] & 0x1f;
- source_port = (edsa_header[1] >> 3) & 0x1f;
-
- skb->dev = dsa_master_find_slave(dev, source_device, source_port);
- if (!skb->dev)
- return NULL;
-
- /*
- * If the 'tagged' bit is set, convert the DSA tag to a 802.1q
- * tag and delete the ethertype part. If the 'tagged' bit is
- * clear, delete the ethertype and the DSA tag parts.
- */
- if (edsa_header[0] & 0x20) {
- u8 new_header[4];
-
- /*
- * Insert 802.1q ethertype and copy the VLAN-related
- * fields, but clear the bit that will hold CFI (since
- * DSA uses that bit location for another purpose).
- */
- new_header[0] = (ETH_P_8021Q >> 8) & 0xff;
- new_header[1] = ETH_P_8021Q & 0xff;
- new_header[2] = edsa_header[2] & ~0x10;
- new_header[3] = edsa_header[3];
-
- /*
- * Move CFI bit from its place in the DSA header to
- * its 802.1q-designated place.
- */
- if (edsa_header[1] & 0x01)
- new_header[2] |= 0x10;
-
- skb_pull_rcsum(skb, DSA_HLEN);
-
- /*
- * Update packet checksum if skb is CHECKSUM_COMPLETE.
- */
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- __wsum c = skb->csum;
- c = csum_add(c, csum_partial(new_header + 2, 2, 0));
- c = csum_sub(c, csum_partial(edsa_header + 2, 2, 0));
- skb->csum = c;
- }
-
- memcpy(edsa_header, new_header, DSA_HLEN);
-
- memmove(skb->data - ETH_HLEN,
- skb->data - ETH_HLEN - DSA_HLEN,
- 2 * ETH_ALEN);
- } else {
- /*
- * Remove DSA tag and update checksum.
- */
- skb_pull_rcsum(skb, EDSA_HLEN);
- memmove(skb->data - ETH_HLEN,
- skb->data - ETH_HLEN - EDSA_HLEN,
- 2 * ETH_ALEN);
- }
-
- return skb;
-}
-
-static int edsa_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto,
- int *offset)
-{
- *offset = 8;
- *proto = ((__be16 *)skb->data)[3];
- return 0;
-}
-
-static const struct dsa_device_ops edsa_netdev_ops = {
- .name = "edsa",
- .proto = DSA_TAG_PROTO_EDSA,
- .xmit = edsa_xmit,
- .rcv = edsa_rcv,
- .flow_dissect = edsa_tag_flow_dissect,
- .overhead = EDSA_HLEN,
-};
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA);
-
-module_dsa_tag_driver(edsa_netdev_ops);
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
index 408d4af390a0..2f5bd5e338ab 100644
--- a/net/dsa/tag_gswip.c
+++ b/net/dsa/tag_gswip.c
@@ -60,13 +60,8 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
- int err;
u8 *gswip_tag;
- err = skb_cow_head(skb, GSWIP_TX_HEADER_LEN);
- if (err)
- return NULL;
-
skb_push(skb, GSWIP_TX_HEADER_LEN);
gswip_tag = skb->data;
diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
new file mode 100644
index 000000000000..a09805c8e1ab
--- /dev/null
+++ b/net/dsa/tag_hellcreek.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * net/dsa/tag_hellcreek.c - Hirschmann Hellcreek switch tag format handling
+ *
+ * Copyright (C) 2019,2020 Linutronix GmbH
+ * Author Kurt Kanzenbach <kurt@linutronix.de>
+ *
+ * Based on tag_ksz.c.
+ */
+
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "dsa_priv.h"
+
+#define HELLCREEK_TAG_LEN 1
+
+static struct sk_buff *hellcreek_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ u8 *tag;
+
+ /* Tag encoding */
+ tag = skb_put(skb, HELLCREEK_TAG_LEN);
+ *tag = BIT(dp->index);
+
+ return skb;
+}
+
+static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
+ struct net_device *dev,
+ struct packet_type *pt)
+{
+ /* Tag decoding */
+ u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN;
+ unsigned int port = tag[0] & 0x03;
+
+ skb->dev = dsa_master_find_slave(dev, 0, port);
+ if (!skb->dev) {
+ netdev_warn(dev, "Failed to get source port: %d\n", port);
+ return NULL;
+ }
+
+ pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN);
+
+ skb->offload_fwd_mark = true;
+
+ return skb;
+}
+
+static const struct dsa_device_ops hellcreek_netdev_ops = {
+ .name = "hellcreek",
+ .proto = DSA_TAG_PROTO_HELLCREEK,
+ .xmit = hellcreek_xmit,
+ .rcv = hellcreek_rcv,
+ .overhead = HELLCREEK_TAG_LEN,
+ .tail_tag = true,
+};
+
+MODULE_LICENSE("Dual MIT/GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK);
+
+module_dsa_tag_driver(hellcreek_netdev_ops);
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index bd1a3158d79a..4820dbcedfa2 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -14,46 +14,6 @@
#define KSZ_EGRESS_TAG_LEN 1
#define KSZ_INGRESS_TAG_LEN 1
-static struct sk_buff *ksz_common_xmit(struct sk_buff *skb,
- struct net_device *dev, int len)
-{
- struct sk_buff *nskb;
- int padlen;
-
- padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len;
-
- if (skb_tailroom(skb) >= padlen + len) {
- /* Let dsa_slave_xmit() free skb */
- if (__skb_put_padto(skb, skb->len + padlen, false))
- return NULL;
-
- nskb = skb;
- } else {
- nskb = alloc_skb(NET_IP_ALIGN + skb->len +
- padlen + len, GFP_ATOMIC);
- if (!nskb)
- return NULL;
- skb_reserve(nskb, NET_IP_ALIGN);
-
- skb_reset_mac_header(nskb);
- skb_set_network_header(nskb,
- skb_network_header(skb) - skb->head);
- skb_set_transport_header(nskb,
- skb_transport_header(skb) - skb->head);
- skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
-
- /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free
- * skb
- */
- if (skb_put_padto(nskb, nskb->len + padlen))
- return NULL;
-
- consume_skb(skb);
- }
-
- return nskb;
-}
-
static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
struct net_device *dev,
unsigned int port, unsigned int len)
@@ -90,23 +50,18 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct sk_buff *nskb;
u8 *tag;
u8 *addr;
- nskb = ksz_common_xmit(skb, dev, KSZ_INGRESS_TAG_LEN);
- if (!nskb)
- return NULL;
-
/* Tag encoding */
- tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN);
- addr = skb_mac_header(nskb);
+ tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
+ addr = skb_mac_header(skb);
*tag = 1 << dp->index;
if (is_link_local_ether_addr(addr))
*tag |= KSZ8795_TAIL_TAG_OVERRIDE;
- return nskb;
+ return skb;
}
static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -123,6 +78,7 @@ static const struct dsa_device_ops ksz8795_netdev_ops = {
.xmit = ksz8795_xmit,
.rcv = ksz8795_rcv,
.overhead = KSZ_INGRESS_TAG_LEN,
+ .tail_tag = true,
};
DSA_TAG_DRIVER(ksz8795_netdev_ops);
@@ -155,18 +111,13 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct sk_buff *nskb;
__be16 *tag;
u8 *addr;
u16 val;
- nskb = ksz_common_xmit(skb, dev, KSZ9477_INGRESS_TAG_LEN);
- if (!nskb)
- return NULL;
-
/* Tag encoding */
- tag = skb_put(nskb, KSZ9477_INGRESS_TAG_LEN);
- addr = skb_mac_header(nskb);
+ tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN);
+ addr = skb_mac_header(skb);
val = BIT(dp->index);
@@ -175,7 +126,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
*tag = cpu_to_be16(val);
- return nskb;
+ return skb;
}
static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -199,6 +150,7 @@ static const struct dsa_device_ops ksz9477_netdev_ops = {
.xmit = ksz9477_xmit,
.rcv = ksz9477_rcv,
.overhead = KSZ9477_INGRESS_TAG_LEN,
+ .tail_tag = true,
};
DSA_TAG_DRIVER(ksz9477_netdev_ops);
@@ -211,24 +163,19 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct sk_buff *nskb;
u8 *addr;
u8 *tag;
- nskb = ksz_common_xmit(skb, dev, KSZ_INGRESS_TAG_LEN);
- if (!nskb)
- return NULL;
-
/* Tag encoding */
- tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN);
- addr = skb_mac_header(nskb);
+ tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
+ addr = skb_mac_header(skb);
*tag = BIT(dp->index);
if (is_link_local_ether_addr(addr))
*tag |= KSZ9893_TAIL_TAG_OVERRIDE;
- return nskb;
+ return skb;
}
static const struct dsa_device_ops ksz9893_netdev_ops = {
@@ -237,6 +184,7 @@ static const struct dsa_device_ops ksz9893_netdev_ops = {
.xmit = ksz9893_xmit,
.rcv = ksz9477_rcv,
.overhead = KSZ_INGRESS_TAG_LEN,
+ .tail_tag = true,
};
DSA_TAG_DRIVER(ksz9893_netdev_ops);
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index ccfb6f641bbf..aa1318dccaf0 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -58,15 +58,6 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev)
__be16 *lan9303_tag;
u16 tag;
- /* insert a special VLAN tag between the MAC addresses
- * and the current ethertype field.
- */
- if (skb_cow_head(skb, LAN9303_TAG_LEN) < 0) {
- dev_dbg(&dev->dev,
- "Cannot make room for the special tag. Dropping packet\n");
- return NULL;
- }
-
/* provide 'LAN9303_TAG_LEN' bytes additional space */
skb_push(skb, LAN9303_TAG_LEN);
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index f602fc758d68..38dcdded74c0 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -34,9 +34,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
* table with VID.
*/
if (!skb_vlan_tagged(skb)) {
- if (skb_cow_head(skb, MTK_HDR_LEN) < 0)
- return NULL;
-
skb_push(skb, MTK_HDR_LEN);
memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN);
is_vlan_skb = false;
@@ -105,21 +102,11 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
return skb;
}
-static int mtk_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto,
- int *offset)
-{
- *offset = 4;
- *proto = ((__be16 *)skb->data)[1];
-
- return 0;
-}
-
static const struct dsa_device_ops mtk_netdev_ops = {
.name = "mtk",
.proto = DSA_TAG_PROTO_MTK,
.xmit = mtk_tag_xmit,
.rcv = mtk_tag_rcv,
- .flow_dissect = mtk_tag_flow_dissect,
.overhead = MTK_HDR_LEN,
};
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index b4fc05cafaa6..16a1afd5b8e1 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -137,31 +137,29 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
struct net_device *netdev)
{
struct dsa_port *dp = dsa_slave_to_port(netdev);
+ struct sk_buff *clone = DSA_SKB_CB(skb)->clone;
struct dsa_switch *ds = dp->ds;
struct ocelot *ocelot = ds->priv;
struct ocelot_port *ocelot_port;
+ u8 *prefix, *injection;
u64 qos_class, rew_op;
- u8 *injection;
-
- if (unlikely(skb_cow_head(skb, OCELOT_TAG_LEN) < 0)) {
- netdev_err(netdev, "Cannot make room for tag.\n");
- return NULL;
- }
ocelot_port = ocelot->ports[dp->index];
injection = skb_push(skb, OCELOT_TAG_LEN);
- memcpy(injection, ocelot_port->xmit_template, OCELOT_TAG_LEN);
+ prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN);
+
+ memcpy(prefix, ocelot_port->xmit_template, OCELOT_TOTAL_TAG_LEN);
+
/* Fix up the fields which are not statically determined
* in the template
*/
qos_class = skb->priority;
packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0);
- if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) {
- struct sk_buff *clone = DSA_SKB_CB(skb)->clone;
-
+ /* TX timestamping was requested */
+ if (clone) {
rew_op = ocelot_port->ptp_cmd;
/* Retrieve timestamp ID populated inside skb->cb[0] of the
* clone by ocelot_port_add_txtstamp_skb
@@ -179,19 +177,24 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
struct net_device *netdev,
struct packet_type *pt)
{
+ struct dsa_port *cpu_dp = netdev->dsa_ptr;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct ocelot *ocelot = ds->priv;
u64 src_port, qos_class;
+ u64 vlan_tci, tag_type;
u8 *start = skb->data;
u8 *extraction;
+ u16 vlan_tpid;
/* Revert skb->data by the amount consumed by the DSA master,
* so it points to the beginning of the frame.
*/
skb_push(skb, ETH_HLEN);
- /* We don't care about the long prefix, it is just for easy entrance
+ /* We don't care about the short prefix, it is just for easy entrance
* into the DSA master's RX filter. Discard it now by moving it into
* the headroom.
*/
- skb_pull(skb, OCELOT_LONG_PREFIX_LEN);
+ skb_pull(skb, OCELOT_SHORT_PREFIX_LEN);
/* And skb->data now points to the extraction frame header.
* Keep a pointer to it.
*/
@@ -205,10 +208,12 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
skb_pull(skb, ETH_HLEN);
/* Remove from inet csum the extraction header */
- skb_postpull_rcsum(skb, start, OCELOT_LONG_PREFIX_LEN + OCELOT_TAG_LEN);
+ skb_postpull_rcsum(skb, start, OCELOT_TOTAL_TAG_LEN);
packing(extraction, &src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0);
packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0);
+ packing(extraction, &tag_type, 16, 16, OCELOT_TAG_LEN, UNPACK, 0);
+ packing(extraction, &vlan_tci, 15, 0, OCELOT_TAG_LEN, UNPACK, 0);
skb->dev = dsa_master_find_slave(netdev, 0, src_port);
if (!skb->dev)
@@ -223,6 +228,33 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
skb->offload_fwd_mark = 1;
skb->priority = qos_class;
+ /* Ocelot switches copy frames unmodified to the CPU. However, it is
+ * possible for the user to request a VLAN modification through
+ * VCAP_IS1_ACT_VID_REPLACE_ENA. In this case, what will happen is that
+ * the VLAN ID field from the Extraction Header gets updated, but the
+ * 802.1Q header does not (the classified VLAN only becomes visible on
+ * egress through the "port tag" of front-panel ports).
+ * So, for traffic extracted by the CPU, we want to pick up the
+ * classified VLAN and manually replace the existing 802.1Q header from
+ * the packet with it, so that the operating system is always up to
+ * date with the result of tc-vlan actions.
+ * NOTE: In VLAN-unaware mode, we don't want to do that, we want the
+ * frame to remain unmodified, because the classified VLAN is always
+ * equal to the pvid of the ingress port and should not be used for
+ * processing.
+ */
+ vlan_tpid = tag_type ? ETH_P_8021AD : ETH_P_8021Q;
+
+ if (ocelot->ports[src_port]->vlan_aware &&
+ eth_hdr(skb)->h_proto == htons(vlan_tpid)) {
+ u16 dummy_vlan_tci;
+
+ skb_push_rcsum(skb, ETH_HLEN);
+ __skb_vlan_pop(skb, &dummy_vlan_tci);
+ skb_pull_rcsum(skb, ETH_HLEN);
+ __vlan_hwaccel_put_tag(skb, htons(vlan_tpid), vlan_tci);
+ }
+
return skb;
}
@@ -231,7 +263,8 @@ static const struct dsa_device_ops ocelot_netdev_ops = {
.proto = DSA_TAG_PROTO_OCELOT,
.xmit = ocelot_xmit,
.rcv = ocelot_rcv,
- .overhead = OCELOT_TAG_LEN + OCELOT_LONG_PREFIX_LEN,
+ .overhead = OCELOT_TOTAL_TAG_LEN,
+ .promisc_on_master = true,
};
MODULE_LICENSE("GPL v2");
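The comment block added to ocelot_rcv() above boils down to a small decision: pick the TPID from the extraction header's tag type, and only rewrite the frame's 802.1Q header when the source port is VLAN-aware and the frame actually carries a tag of that TPID. A toy sketch of that decision (plain C, not the kernel code; helper name is made up):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ETH_P_8021Q  0x8100
    #define ETH_P_8021AD 0x88a8

    /* Return true when the classified VLAN from the extraction header
     * should replace the 802.1Q header already present in the frame.
     */
    static bool use_classified_vlan(bool vlan_aware, uint16_t frame_tpid,
                                    int tag_type)
    {
            uint16_t vlan_tpid = tag_type ? ETH_P_8021AD : ETH_P_8021Q;

            return vlan_aware && frame_tpid == vlan_tpid;
    }

    int main(void)
    {
            /* VLAN-aware port, C-tagged frame: rewrite with the result of
             * any tc-vlan action.
             */
            printf("%d\n", use_classified_vlan(true, ETH_P_8021Q, 0));  /* 1 */

            /* VLAN-unaware port: leave the frame untouched, the classified
             * VLAN is just the pvid of the ingress port.
             */
            printf("%d\n", use_classified_vlan(false, ETH_P_8021Q, 0)); /* 0 */
            return 0;
    }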
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 7066f5e697d7..88181b52f480 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -34,9 +34,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
__be16 *phdr;
u16 hdr;
- if (skb_cow_head(skb, QCA_HDR_LEN) < 0)
- return NULL;
-
skb_push(skb, QCA_HDR_LEN);
memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN);
@@ -89,21 +86,11 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
return skb;
}
-static int qca_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto,
- int *offset)
-{
- *offset = QCA_HDR_LEN;
- *proto = ((__be16 *)skb->data)[0];
-
- return 0;
-}
-
static const struct dsa_device_ops qca_netdev_ops = {
.name = "qca",
.proto = DSA_TAG_PROTO_QCA,
.xmit = qca_tag_xmit,
.rcv = qca_tag_rcv,
- .flow_dissect = qca_tag_flow_dissect,
.overhead = QCA_HDR_LEN,
};
diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c
index 7b63010fa87b..2646abe5a69e 100644
--- a/net/dsa/tag_rtl4_a.c
+++ b/net/dsa/tag_rtl4_a.c
@@ -106,22 +106,11 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
return skb;
}
-static int rtl4a_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto,
- int *offset)
-{
- *offset = RTL4_A_HDR_LEN;
- /* Skip past the tag and fetch the encapsulated Ethertype */
- *proto = ((__be16 *)skb->data)[1];
-
- return 0;
-}
-
static const struct dsa_device_ops rtl4a_netdev_ops = {
.name = "rtl4a",
.proto = DSA_TAG_PROTO_RTL4_A,
.xmit = rtl4a_tag_xmit,
.rcv = rtl4a_tag_rcv,
- .flow_dissect = rtl4a_tag_flow_dissect,
.overhead = RTL4_A_HDR_LEN,
};
module_dsa_tag_driver(rtl4a_netdev_ops);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 9b4a4d719291..50496013cdb7 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -72,14 +72,21 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb)
static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb)
{
struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
+ u16 vlan_tci;
if (hdr->h_vlan_proto == htons(ETH_P_SJA1105))
return true;
- if (hdr->h_vlan_proto != htons(ETH_P_8021Q))
+ if (hdr->h_vlan_proto != htons(ETH_P_8021Q) &&
+ !skb_vlan_tag_present(skb))
return false;
- return vid_is_dsa_8021q(ntohs(hdr->h_vlan_TCI) & VLAN_VID_MASK);
+ if (skb_vlan_tag_present(skb))
+ vlan_tci = skb_vlan_tag_get(skb);
+ else
+ vlan_tci = ntohs(hdr->h_vlan_TCI);
+
+ return vid_is_dsa_8021q(vlan_tci & VLAN_VID_MASK);
}
/* This is the first time the tagger sees the frame on RX.
@@ -283,7 +290,8 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
hdr = eth_hdr(skb);
tpid = ntohs(hdr->h_proto);
- is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q);
+ is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q ||
+ skb_vlan_tag_present(skb));
is_link_local = sja1105_is_link_local(skb);
is_meta = sja1105_is_meta_frame(skb);
@@ -292,7 +300,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
if (is_tagged) {
/* Normal traffic path. */
skb_push_rcsum(skb, ETH_HLEN);
- __skb_vlan_pop(skb, &tci);
+ if (skb_vlan_tag_present(skb)) {
+ tci = skb_vlan_tag_get(skb);
+ __vlan_hwaccel_clear_tag(skb);
+ } else {
+ __skb_vlan_pop(skb, &tci);
+ }
skb_pull_rcsum(skb, ETH_HLEN);
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
@@ -333,6 +346,16 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
is_meta);
}
+static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto,
+ int *offset)
+{
+ /* No tag added for management frames, all ok */
+ if (unlikely(sja1105_is_link_local(skb)))
+ return;
+
+ dsa_tag_generic_flow_dissect(skb, proto, offset);
+}
+
static const struct dsa_device_ops sja1105_netdev_ops = {
.name = "sja1105",
.proto = DSA_TAG_PROTO_SJA1105,
@@ -340,6 +363,8 @@ static const struct dsa_device_ops sja1105_netdev_ops = {
.rcv = sja1105_rcv,
.filter = sja1105_filter,
.overhead = VLAN_HLEN,
+ .flow_dissect = sja1105_flow_dissect,
+ .promisc_on_master = true,
};
MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 4f8ab62f0208..5b97ede56a0f 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -13,42 +13,15 @@
static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct sk_buff *nskb;
- int padlen;
u8 *trailer;
- /*
- * We have to make sure that the trailer ends up as the very
- * last 4 bytes of the packet. This means that we have to pad
- * the packet to the minimum ethernet frame size, if necessary,
- * before adding the trailer.
- */
- padlen = 0;
- if (skb->len < 60)
- padlen = 60 - skb->len;
-
- nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC);
- if (!nskb)
- return NULL;
- skb_reserve(nskb, NET_IP_ALIGN);
-
- skb_reset_mac_header(nskb);
- skb_set_network_header(nskb, skb_network_header(skb) - skb->head);
- skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head);
- skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
- consume_skb(skb);
-
- if (padlen) {
- skb_put_zero(nskb, padlen);
- }
-
- trailer = skb_put(nskb, 4);
+ trailer = skb_put(skb, 4);
trailer[0] = 0x80;
trailer[1] = 1 << dp->index;
trailer[2] = 0x10;
trailer[3] = 0x00;
- return nskb;
+ return skb;
}
static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -83,6 +56,7 @@ static const struct dsa_device_ops trailer_netdev_ops = {
.xmit = trailer_xmit,
.rcv = trailer_rcv,
.overhead = 4,
+ .tail_tag = true,
};
MODULE_LICENSE("GPL");
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index dac65180c4ef..4106373180c6 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -272,7 +272,7 @@ void eth_header_cache_update(struct hh_cache *hh,
EXPORT_SYMBOL(eth_header_cache_update);
/**
- * eth_header_parser_protocol - extract protocol from L2 header
+ * eth_header_parse_protocol - extract protocol from L2 header
* @skb: packet to extract protocol from
*/
__be16 eth_header_parse_protocol(const struct sk_buff *skb)
@@ -523,8 +523,8 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
EXPORT_SYMBOL(eth_platform_get_mac_address);
/**
- * Obtain the MAC address from an nvmem cell named 'mac-address' associated
- * with given device.
+ * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named
+ * 'mac-address' associated with given device.
*
* @dev: Device with which the mac-address cell is associated.
* @addrbuf: Buffer to which the MAC address will be copied on success.
diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c
index dae7402eaca3..0515d6604b3b 100644
--- a/net/ethtool/bitset.c
+++ b/net/ethtool/bitset.c
@@ -302,8 +302,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = {
- [ETHTOOL_A_BITSET_UNSPEC] = { .type = NLA_REJECT },
+static const struct nla_policy bitset_policy[] = {
[ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG },
[ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32,
ETHNL_MAX_BITSET_SIZE),
@@ -312,8 +311,7 @@ static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = {
[ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY },
};
-static const struct nla_policy bit_policy[ETHTOOL_A_BITSET_BIT_MAX + 1] = {
- [ETHTOOL_A_BITSET_BIT_UNSPEC] = { .type = NLA_REJECT },
+static const struct nla_policy bit_policy[] = {
[ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 },
[ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING },
[ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG },
@@ -329,10 +327,10 @@ static const struct nla_policy bit_policy[ETHTOOL_A_BITSET_BIT_MAX + 1] = {
*/
int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact)
{
- struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1];
+ struct nlattr *tb[ARRAY_SIZE(bitset_policy)];
int ret;
- ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, bitset,
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, bitset,
bitset_policy, NULL);
if (ret < 0)
return ret;
@@ -381,10 +379,10 @@ static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits,
ethnl_string_array_t names,
struct netlink_ext_ack *extack)
{
- struct nlattr *tb[ETHTOOL_A_BITSET_BIT_MAX + 1];
+ struct nlattr *tb[ARRAY_SIZE(bit_policy)];
int ret, idx;
- ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_BIT_MAX, bit_attr,
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bit_policy) - 1, bit_attr,
bit_policy, extack);
if (ret < 0)
return ret;
@@ -555,15 +553,15 @@ int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits,
const struct nlattr *attr, ethnl_string_array_t names,
struct netlink_ext_ack *extack, bool *mod)
{
- struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1];
+ struct nlattr *tb[ARRAY_SIZE(bitset_policy)];
unsigned int change_bits;
bool no_mask;
int ret;
if (!attr)
return 0;
- ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, attr, bitset_policy,
- extack);
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr,
+ bitset_policy, extack);
if (ret < 0)
return ret;
@@ -608,7 +606,7 @@ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask,
ethnl_string_array_t names,
struct netlink_ext_ack *extack)
{
- struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1];
+ struct nlattr *tb[ARRAY_SIZE(bitset_policy)];
const struct nlattr *bit_attr;
bool no_mask;
int rem;
@@ -616,8 +614,8 @@ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask,
if (!attr)
return 0;
- ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, attr, bitset_policy,
- extack);
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr,
+ bitset_policy, extack);
if (ret < 0)
return ret;
no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
@@ -630,6 +628,8 @@ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask,
return ret;
change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]);
+ if (change_bits > nbits)
+ change_bits = nbits;
bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]),
change_bits);
if (change_bits < nbits)
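The two lines added above clamp the ETHTOOL_A_BITSET_SIZE value supplied by userspace to the number of bits the kernel-side bitmap can actually hold, so bitmap_from_arr32() never writes past the destination. A toy model of the fix (standalone C, simplified to a raw memcpy):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Copy change_bits bits worth of u32 words from src into dst, where
     * dst only has room for nbits bits. Without the clamp, a request
     * claiming a larger size would overrun dst.
     */
    static void copy_bits(uint32_t *dst, const uint32_t *src,
                          unsigned int nbits, unsigned int change_bits)
    {
            if (change_bits > nbits)        /* the added clamp */
                    change_bits = nbits;

            memcpy(dst, src, (change_bits + 31) / 32 * sizeof(uint32_t));
    }

    int main(void)
    {
            uint32_t dst[2] = { 0 };                 /* room for 64 bits  */
            uint32_t src[4] = { ~0u, ~0u, ~0u, ~0u };

            copy_bits(dst, src, 64, 128);            /* claims 128 bits   */
            printf("%08x %08x\n", dst[1], dst[0]);   /* ffffffff ffffffff */
            return 0;
    }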
diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c
index 888f6e101f34..63560bbb7d1f 100644
--- a/net/ethtool/cabletest.c
+++ b/net/ethtool/cabletest.c
@@ -11,10 +11,9 @@
*/
#define MAX_CABLE_LENGTH_CM (150 * 100)
-static const struct nla_policy
-cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = {
- [ETHTOOL_A_CABLE_TEST_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_cable_test_act_policy[] = {
+ [ETHTOOL_A_CABLE_TEST_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd)
@@ -56,18 +55,12 @@ out:
int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_CABLE_TEST_MAX + 1];
struct ethnl_req_info req_info = {};
const struct ethtool_phy_ops *ops;
+ struct nlattr **tb = info->attrs;
struct net_device *dev;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_CABLE_TEST_MAX,
- cable_test_act_policy, info->extack);
- if (ret < 0)
- return ret;
-
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_CABLE_TEST_HEADER],
genl_info_net(info), info->extack,
@@ -218,18 +211,16 @@ struct cable_test_tdr_req_info {
struct ethnl_req_info base;
};
-static const struct nla_policy
-cable_test_tdr_act_cfg_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1] = {
+static const struct nla_policy cable_test_tdr_act_cfg_policy[] = {
[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 },
[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 },
[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 },
[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 },
};
-static const struct nla_policy
-cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = {
- [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_cable_test_tdr_act_policy[] = {
+ [ETHTOOL_A_CABLE_TEST_TDR_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED },
};
@@ -238,7 +229,7 @@ static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest,
struct genl_info *info,
struct phy_tdr_config *cfg)
{
- struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1];
+ struct nlattr *tb[ARRAY_SIZE(cable_test_tdr_act_cfg_policy)];
int ret;
cfg->first = 100;
@@ -249,8 +240,10 @@ static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest,
if (!nest)
return 0;
- ret = nla_parse_nested(tb, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX, nest,
- cable_test_tdr_act_cfg_policy, info->extack);
+ ret = nla_parse_nested(tb,
+ ARRAY_SIZE(cable_test_tdr_act_cfg_policy) - 1,
+ nest, cable_test_tdr_act_cfg_policy,
+ info->extack);
if (ret < 0)
return ret;
@@ -313,19 +306,13 @@ static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest,
int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1];
struct ethnl_req_info req_info = {};
const struct ethtool_phy_ops *ops;
+ struct nlattr **tb = info->attrs;
struct phy_tdr_config cfg;
struct net_device *dev;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_CABLE_TEST_TDR_MAX,
- cable_test_tdr_act_policy, info->extack);
- if (ret < 0)
- return ret;
-
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER],
genl_info_net(info), info->extack,
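The conversions in bitset.c and cabletest.c above (and in the ethtool files that follow) rely on the same C idiom: when a policy array uses designated initializers indexed by the attribute constants, the compiler sizes it to the highest index used, so ARRAY_SIZE(policy) - 1 is the maximum attribute type and tb[ARRAY_SIZE(policy)] has one slot per attribute. A minimal standalone illustration (names invented for the example):

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    enum { ATTR_UNSPEC, ATTR_HEADER, ATTR_VALUE, ATTR_MASK };

    /* Sized by the highest designated index, i.e. ATTR_MASK + 1 entries. */
    static const int policy[] = {
            [ATTR_HEADER] = 1,
            [ATTR_VALUE]  = 2,
            [ATTR_MASK]   = 3,
    };

    int main(void)
    {
            int tb[ARRAY_SIZE(policy)];     /* one slot per attribute */

            (void)tb;
            printf("maxattr = %zu\n", ARRAY_SIZE(policy) - 1);   /* 3 */
            return 0;
    }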
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
index 9ef54cdcf662..25a9e566ef5c 100644
--- a/net/ethtool/channels.c
+++ b/net/ethtool/channels.c
@@ -17,18 +17,9 @@ struct channels_reply_data {
#define CHANNELS_REPDATA(__reply_base) \
container_of(__reply_base, struct channels_reply_data, base)
-static const struct nla_policy
-channels_get_policy[ETHTOOL_A_CHANNELS_MAX + 1] = {
- [ETHTOOL_A_CHANNELS_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_CHANNELS_RX_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_TX_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_OTHER_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_COMBINED_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_channels_get_policy[] = {
+ [ETHTOOL_A_CHANNELS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int channels_prepare_data(const struct ethnl_req_info *req_base,
@@ -99,10 +90,8 @@ const struct ethnl_request_ops ethnl_channels_request_ops = {
.request_cmd = ETHTOOL_MSG_CHANNELS_GET,
.reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY,
.hdr_attr = ETHTOOL_A_CHANNELS_HEADER,
- .max_attr = ETHTOOL_A_CHANNELS_MAX,
.req_info_size = sizeof(struct channels_req_info),
.reply_data_size = sizeof(struct channels_reply_data),
- .request_policy = channels_get_policy,
.prepare_data = channels_prepare_data,
.reply_size = channels_reply_size,
@@ -111,14 +100,9 @@ const struct ethnl_request_ops ethnl_channels_request_ops = {
/* CHANNELS_SET */
-static const struct nla_policy
-channels_set_policy[ETHTOOL_A_CHANNELS_MAX + 1] = {
- [ETHTOOL_A_CHANNELS_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_CHANNELS_RX_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_TX_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_OTHER_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_CHANNELS_COMBINED_MAX] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_channels_set_policy[] = {
+ [ETHTOOL_A_CHANNELS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_U32 },
[ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_U32 },
[ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_U32 },
@@ -127,22 +111,17 @@ channels_set_policy[ETHTOOL_A_CHANNELS_MAX + 1] = {
int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_CHANNELS_MAX + 1];
unsigned int from_channel, old_total, i;
bool mod = false, mod_combined = false;
struct ethtool_channels channels = {};
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
const struct nlattr *err_attr;
const struct ethtool_ops *ops;
struct net_device *dev;
u32 max_rx_in_use = 0;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_CHANNELS_MAX, channels_set_policy,
- info->extack);
- if (ret < 0)
- return ret;
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_CHANNELS_HEADER],
genl_info_net(info), info->extack,
@@ -215,17 +194,19 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info)
if (netif_is_rxfh_configured(dev) &&
!ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
(channels.combined_count + channels.rx_count) <= max_rx_in_use) {
+ ret = -EINVAL;
GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings");
- return -EINVAL;
+ goto out_ops;
}
/* Disabling channels, query zero-copy AF_XDP sockets */
from_channel = channels.combined_count +
min(channels.rx_count, channels.tx_count);
for (i = from_channel; i < old_total; i++)
- if (xdp_get_umem_from_qid(dev, i)) {
+ if (xsk_get_pool_from_qid(dev, i)) {
+ ret = -EINVAL;
GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets");
- return -EINVAL;
+ goto out_ops;
}
ret = dev->ethtool_ops->set_channels(dev, &channels);
diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c
index 6afd99042d67..1d6bc132aa4d 100644
--- a/net/ethtool/coalesce.c
+++ b/net/ethtool/coalesce.c
@@ -51,32 +51,9 @@ __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_HIGH);
__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_HIGH);
__CHECK_SUPPORTED_OFFSET(COALESCE_RATE_SAMPLE_INTERVAL);
-static const struct nla_policy
-coalesce_get_policy[ETHTOOL_A_COALESCE_MAX + 1] = {
- [ETHTOOL_A_COALESCE_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_coalesce_get_policy[] = {
+ [ETHTOOL_A_COALESCE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int coalesce_prepare_data(const struct ethnl_req_info *req_base,
@@ -203,10 +180,8 @@ const struct ethnl_request_ops ethnl_coalesce_request_ops = {
.request_cmd = ETHTOOL_MSG_COALESCE_GET,
.reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY,
.hdr_attr = ETHTOOL_A_COALESCE_HEADER,
- .max_attr = ETHTOOL_A_COALESCE_MAX,
.req_info_size = sizeof(struct coalesce_req_info),
.reply_data_size = sizeof(struct coalesce_reply_data),
- .request_policy = coalesce_get_policy,
.prepare_data = coalesce_prepare_data,
.reply_size = coalesce_reply_size,
@@ -215,10 +190,9 @@ const struct ethnl_request_ops ethnl_coalesce_request_ops = {
/* COALESCE_SET */
-static const struct nla_policy
-coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1] = {
- [ETHTOOL_A_COALESCE_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_COALESCE_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_coalesce_set_policy[] = {
+ [ETHTOOL_A_COALESCE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_U32 },
[ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_U32 },
[ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_U32 },
@@ -245,9 +219,9 @@ coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1] = {
int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_COALESCE_MAX + 1];
struct ethtool_coalesce coalesce = {};
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
const struct ethtool_ops *ops;
struct net_device *dev;
u32 supported_params;
@@ -255,11 +229,6 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info)
int ret;
u16 a;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_COALESCE_MAX, coalesce_set_policy,
- info->extack);
- if (ret < 0)
- return ret;
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_COALESCE_HEADER],
genl_info_net(info), info->extack,
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index ed19573fccd7..24036e3055a1 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -192,6 +192,8 @@ const char link_mode_names[][ETH_GSTRING_LEN] = {
__DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full),
__DEFINE_LINK_MODE_NAME(400000, DR4, Full),
__DEFINE_LINK_MODE_NAME(400000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100, FX, Half),
+ __DEFINE_LINK_MODE_NAME(100, FX, Full),
};
static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c
index 1bd026a29f3f..f99912d7957e 100644
--- a/net/ethtool/debug.c
+++ b/net/ethtool/debug.c
@@ -16,11 +16,9 @@ struct debug_reply_data {
#define DEBUG_REPDATA(__reply_base) \
container_of(__reply_base, struct debug_reply_data, base)
-static const struct nla_policy
-debug_get_policy[ETHTOOL_A_DEBUG_MAX + 1] = {
- [ETHTOOL_A_DEBUG_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_DEBUG_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_debug_get_policy[] = {
+ [ETHTOOL_A_DEBUG_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int debug_prepare_data(const struct ethnl_req_info *req_base,
@@ -69,10 +67,8 @@ const struct ethnl_request_ops ethnl_debug_request_ops = {
.request_cmd = ETHTOOL_MSG_DEBUG_GET,
.reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY,
.hdr_attr = ETHTOOL_A_DEBUG_HEADER,
- .max_attr = ETHTOOL_A_DEBUG_MAX,
.req_info_size = sizeof(struct debug_req_info),
.reply_data_size = sizeof(struct debug_reply_data),
- .request_policy = debug_get_policy,
.prepare_data = debug_prepare_data,
.reply_size = debug_reply_size,
@@ -81,27 +77,21 @@ const struct ethnl_request_ops ethnl_debug_request_ops = {
/* DEBUG_SET */
-static const struct nla_policy
-debug_set_policy[ETHTOOL_A_DEBUG_MAX + 1] = {
- [ETHTOOL_A_DEBUG_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_DEBUG_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_debug_set_policy[] = {
+ [ETHTOOL_A_DEBUG_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED },
};
int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_DEBUG_MAX + 1];
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
struct net_device *dev;
bool mod = false;
u32 msg_mask;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_DEBUG_MAX, debug_set_policy,
- info->extack);
- if (ret < 0)
- return ret;
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_DEBUG_HEADER],
genl_info_net(info), info->extack,
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
index 94aa19cff22f..901b7de941ab 100644
--- a/net/ethtool/eee.c
+++ b/net/ethtool/eee.c
@@ -19,16 +19,9 @@ struct eee_reply_data {
#define EEE_REPDATA(__reply_base) \
container_of(__reply_base, struct eee_reply_data, base)
-static const struct nla_policy
-eee_get_policy[ETHTOOL_A_EEE_MAX + 1] = {
- [ETHTOOL_A_EEE_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_EEE_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_REJECT },
- [ETHTOOL_A_EEE_MODES_PEER] = { .type = NLA_REJECT },
- [ETHTOOL_A_EEE_ACTIVE] = { .type = NLA_REJECT },
- [ETHTOOL_A_EEE_ENABLED] = { .type = NLA_REJECT },
- [ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_REJECT },
- [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_eee_get_policy[] = {
+ [ETHTOOL_A_EEE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int eee_prepare_data(const struct ethnl_req_info *req_base,
@@ -119,10 +112,8 @@ const struct ethnl_request_ops ethnl_eee_request_ops = {
.request_cmd = ETHTOOL_MSG_EEE_GET,
.reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY,
.hdr_attr = ETHTOOL_A_EEE_HEADER,
- .max_attr = ETHTOOL_A_EEE_MAX,
.req_info_size = sizeof(struct eee_req_info),
.reply_data_size = sizeof(struct eee_reply_data),
- .request_policy = eee_get_policy,
.prepare_data = eee_prepare_data,
.reply_size = eee_reply_size,
@@ -131,13 +122,10 @@ const struct ethnl_request_ops ethnl_eee_request_ops = {
/* EEE_SET */
-static const struct nla_policy
-eee_set_policy[ETHTOOL_A_EEE_MAX + 1] = {
- [ETHTOOL_A_EEE_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_EEE_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_eee_set_policy[] = {
+ [ETHTOOL_A_EEE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_NESTED },
- [ETHTOOL_A_EEE_MODES_PEER] = { .type = NLA_REJECT },
- [ETHTOOL_A_EEE_ACTIVE] = { .type = NLA_REJECT },
[ETHTOOL_A_EEE_ENABLED] = { .type = NLA_U8 },
[ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_U8 },
[ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_U32 },
@@ -145,18 +133,14 @@ eee_set_policy[ETHTOOL_A_EEE_MAX + 1] = {
int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_EEE_MAX + 1];
- struct ethtool_eee eee = {};
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
const struct ethtool_ops *ops;
+ struct ethtool_eee eee = {};
struct net_device *dev;
bool mod = false;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_EEE_MAX,
- eee_set_policy, info->extack);
- if (ret < 0)
- return ret;
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_EEE_HEADER],
genl_info_net(info), info->extack,
diff --git a/net/ethtool/features.c b/net/ethtool/features.c
index 495635f152ba..1c9f4df273bd 100644
--- a/net/ethtool/features.c
+++ b/net/ethtool/features.c
@@ -20,14 +20,9 @@ struct features_reply_data {
#define FEATURES_REPDATA(__reply_base) \
container_of(__reply_base, struct features_reply_data, base)
-static const struct nla_policy
-features_get_policy[ETHTOOL_A_FEATURES_MAX + 1] = {
- [ETHTOOL_A_FEATURES_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_FEATURES_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_FEATURES_HW] = { .type = NLA_REJECT },
- [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_REJECT },
- [ETHTOOL_A_FEATURES_ACTIVE] = { .type = NLA_REJECT },
- [ETHTOOL_A_FEATURES_NOCHANGE] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_features_get_policy[] = {
+ [ETHTOOL_A_FEATURES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src)
@@ -120,10 +115,8 @@ const struct ethnl_request_ops ethnl_features_request_ops = {
.request_cmd = ETHTOOL_MSG_FEATURES_GET,
.reply_cmd = ETHTOOL_MSG_FEATURES_GET_REPLY,
.hdr_attr = ETHTOOL_A_FEATURES_HEADER,
- .max_attr = ETHTOOL_A_FEATURES_MAX,
.req_info_size = sizeof(struct features_req_info),
.reply_data_size = sizeof(struct features_reply_data),
- .request_policy = features_get_policy,
.prepare_data = features_prepare_data,
.reply_size = features_reply_size,
@@ -132,14 +125,10 @@ const struct ethnl_request_ops ethnl_features_request_ops = {
/* FEATURES_SET */
-static const struct nla_policy
-features_set_policy[ETHTOOL_A_FEATURES_MAX + 1] = {
- [ETHTOOL_A_FEATURES_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_FEATURES_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_FEATURES_HW] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_features_set_policy[] = {
+ [ETHTOOL_A_FEATURES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_NESTED },
- [ETHTOOL_A_FEATURES_ACTIVE] = { .type = NLA_REJECT },
- [ETHTOOL_A_FEATURES_NOCHANGE] = { .type = NLA_REJECT },
};
static void ethnl_features_to_bitmap(unsigned long *dest, netdev_features_t val)
@@ -229,17 +218,12 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
DECLARE_BITMAP(new_wanted, NETDEV_FEATURE_COUNT);
DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT);
DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT);
- struct nlattr *tb[ETHTOOL_A_FEATURES_MAX + 1];
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
struct net_device *dev;
bool mod;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_FEATURES_MAX, features_set_policy,
- info->extack);
- if (ret < 0)
- return ret;
if (!tb[ETHTOOL_A_FEATURES_WANTED])
return -EINVAL;
ret = ethnl_parse_header_dev_get(&req_info,
@@ -296,7 +280,7 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
active_diff_mask, compact);
}
if (mod)
- ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL);
+ netdev_features_change(dev);
out_rtnl:
rtnl_unlock();
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 441794e0034f..771688e1b0da 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1706,7 +1706,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
min(channels.rx_count, channels.tx_count);
to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
for (i = from_channel; i < to_channel; i++)
- if (xdp_get_umem_from_qid(dev, i))
+ if (xsk_get_pool_from_qid(dev, i))
return -EINVAL;
ret = dev->ethtool_ops->set_channels(dev, &channels);
@@ -1861,23 +1861,18 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
} else {
/* Driver expects to be called at twice the frequency in rc */
- int n = rc * 2, i, interval = HZ / n;
+ int n = rc * 2, interval = HZ / n;
+ u64 count = n * id.data, i = 0;
- /* Count down seconds */
do {
- /* Count down iterations per second */
- i = n;
- do {
- rtnl_lock();
- rc = ops->set_phys_id(dev,
- (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
- rtnl_unlock();
- if (rc)
- break;
- schedule_timeout_interruptible(interval);
- } while (!signal_pending(current) && --i != 0);
- } while (!signal_pending(current) &&
- (id.data == 0 || --id.data != 0));
+ rtnl_lock();
+ rc = ops->set_phys_id(dev,
+ (i++ & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
+ rtnl_unlock();
+ if (rc)
+ break;
+ schedule_timeout_interruptible(interval);
+ } while (!signal_pending(current) && (!id.data || i < count));
}
rtnl_lock();
@@ -2438,7 +2433,7 @@ static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev,
return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt);
default:
return -EOPNOTSUPP;
- };
+ }
}
static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
@@ -2464,14 +2459,15 @@ static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
{
- int ret;
- struct ethtool_tunable tuna;
struct phy_device *phydev = dev->phydev;
+ struct ethtool_tunable tuna;
+ bool phy_drv_tunable;
void *data;
+ int ret;
- if (!(phydev && phydev->drv && phydev->drv->get_tunable))
+ phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable;
+ if (!phy_drv_tunable && !dev->ethtool_ops->get_phy_tunable)
return -EOPNOTSUPP;
-
if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
return -EFAULT;
ret = ethtool_phy_tunable_valid(&tuna);
@@ -2480,9 +2476,13 @@ static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
data = kmalloc(tuna.len, GFP_USER);
if (!data)
return -ENOMEM;
- mutex_lock(&phydev->lock);
- ret = phydev->drv->get_tunable(phydev, &tuna, data);
- mutex_unlock(&phydev->lock);
+ if (phy_drv_tunable) {
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ } else {
+ ret = dev->ethtool_ops->get_phy_tunable(dev, &tuna, data);
+ }
if (ret)
goto out;
useraddr += sizeof(tuna);
@@ -2498,12 +2498,14 @@ out:
static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
{
- int ret;
- struct ethtool_tunable tuna;
struct phy_device *phydev = dev->phydev;
+ struct ethtool_tunable tuna;
+ bool phy_drv_tunable;
void *data;
+ int ret;
- if (!(phydev && phydev->drv && phydev->drv->set_tunable))
+ phy_drv_tunable = phydev && phydev->drv && phydev->drv->set_tunable;
+ if (!phy_drv_tunable && !dev->ethtool_ops->set_phy_tunable)
return -EOPNOTSUPP;
if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
return -EFAULT;
@@ -2514,9 +2516,13 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
data = memdup_user(useraddr, tuna.len);
if (IS_ERR(data))
return PTR_ERR(data);
- mutex_lock(&phydev->lock);
- ret = phydev->drv->set_tunable(phydev, &tuna, data);
- mutex_unlock(&phydev->lock);
+ if (phy_drv_tunable) {
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->set_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ } else {
+ ret = dev->ethtool_ops->set_phy_tunable(dev, &tuna, data);
+ }
kfree(data);
return ret;
@@ -3025,13 +3031,14 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
case TCP_V4_FLOW:
case TCP_V6_FLOW:
match->key.basic.ip_proto = IPPROTO_TCP;
+ match->mask.basic.ip_proto = 0xff;
break;
case UDP_V4_FLOW:
case UDP_V6_FLOW:
match->key.basic.ip_proto = IPPROTO_UDP;
+ match->mask.basic.ip_proto = 0xff;
break;
}
- match->mask.basic.ip_proto = 0xff;
match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC);
match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] =
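The PHY tunable ioctls above now fall back to the netdev's ethtool_ops when the attached PHY driver offers no tunable support, which lets MAC drivers that manage their PHYs internally answer ETHTOOL_PHY_GTUNABLE/ETHTOOL_PHY_STUNABLE. A minimal sketch of such driver hooks, assuming the new get_phy_tunable/set_phy_tunable ethtool_ops members take (dev, tuna, data) exactly as called above; the foo_* names and the tunable handling are hypothetical:

/* Hypothetical MAC-driver hooks; all foo_* identifiers are placeholders. */
static int foo_get_phy_tunable(struct net_device *dev,
			       struct ethtool_tunable *tuna, void *data)
{
	if (tuna->id != ETHTOOL_PHY_FAST_LINK_DOWN)
		return -EOPNOTSUPP;
	*(u8 *)data = 0;		/* placeholder: report current value */
	return 0;
}

static int foo_set_phy_tunable(struct net_device *dev,
			       struct ethtool_tunable *tuna, const void *data)
{
	if (tuna->id != ETHTOOL_PHY_FAST_LINK_DOWN)
		return -EOPNOTSUPP;
	return 0;			/* placeholder: program the hardware */
}

static const struct ethtool_ops foo_ethtool_ops = {
	.get_phy_tunable = foo_get_phy_tunable,
	.set_phy_tunable = foo_set_phy_tunable,
};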
diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c
index 5eaf173eaaca..b91839870efc 100644
--- a/net/ethtool/linkinfo.c
+++ b/net/ethtool/linkinfo.c
@@ -16,15 +16,9 @@ struct linkinfo_reply_data {
#define LINKINFO_REPDATA(__reply_base) \
container_of(__reply_base, struct linkinfo_reply_data, base)
-static const struct nla_policy
-linkinfo_get_policy[ETHTOOL_A_LINKINFO_MAX + 1] = {
- [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_linkinfo_get_policy[] = {
+ [ETHTOOL_A_LINKINFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int linkinfo_prepare_data(const struct ethnl_req_info *req_base,
@@ -83,10 +77,8 @@ const struct ethnl_request_ops ethnl_linkinfo_request_ops = {
.request_cmd = ETHTOOL_MSG_LINKINFO_GET,
.reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY,
.hdr_attr = ETHTOOL_A_LINKINFO_HEADER,
- .max_attr = ETHTOOL_A_LINKINFO_MAX,
.req_info_size = sizeof(struct linkinfo_req_info),
.reply_data_size = sizeof(struct linkinfo_reply_data),
- .request_policy = linkinfo_get_policy,
.prepare_data = linkinfo_prepare_data,
.reply_size = linkinfo_reply_size,
@@ -95,32 +87,24 @@ const struct ethnl_request_ops ethnl_linkinfo_request_ops = {
/* LINKINFO_SET */
-static const struct nla_policy
-linkinfo_set_policy[ETHTOOL_A_LINKINFO_MAX + 1] = {
- [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_linkinfo_set_policy[] = {
+ [ETHTOOL_A_LINKINFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 },
[ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 },
- [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT },
[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 },
- [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT },
};
int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_LINKINFO_MAX + 1];
struct ethtool_link_ksettings ksettings = {};
struct ethtool_link_settings *lsettings;
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
struct net_device *dev;
bool mod = false;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_LINKINFO_MAX, linkinfo_set_policy,
- info->extack);
- if (ret < 0)
- return ret;
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_LINKINFO_HEADER],
genl_info_net(info), info->extack,
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
index 7044a2853886..c5bcb9abc8b9 100644
--- a/net/ethtool/linkmodes.c
+++ b/net/ethtool/linkmodes.c
@@ -18,17 +18,9 @@ struct linkmodes_reply_data {
#define LINKMODES_REPDATA(__reply_base) \
container_of(__reply_base, struct linkmodes_reply_data, base)
-static const struct nla_policy
-linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = {
- [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_linkmodes_get_policy[] = {
+ [ETHTOOL_A_LINKMODES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int linkmodes_prepare_data(const struct ethnl_req_info *req_base,
@@ -148,10 +140,8 @@ const struct ethnl_request_ops ethnl_linkmodes_request_ops = {
.request_cmd = ETHTOOL_MSG_LINKMODES_GET,
.reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY,
.hdr_attr = ETHTOOL_A_LINKMODES_HEADER,
- .max_attr = ETHTOOL_A_LINKMODES_MAX,
.req_info_size = sizeof(struct linkmodes_req_info),
.reply_data_size = sizeof(struct linkmodes_reply_data),
- .request_policy = linkmodes_get_policy,
.prepare_data = linkmodes_prepare_data,
.reply_size = linkmodes_reply_size,
@@ -272,19 +262,18 @@ static const struct link_mode_info link_mode_params[] = {
__DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full),
__DEFINE_LINK_MODE_PARAMS(400000, DR4, Full),
__DEFINE_LINK_MODE_PARAMS(400000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Half),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Full),
};
-static const struct nla_policy
-linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = {
- [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_linkmodes_set_policy[] = {
+ [ETHTOOL_A_LINKMODES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_U8 },
[ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_NESTED },
- [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT },
[ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 },
[ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 },
[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 },
- [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT },
};
/* Set advertised link modes to all supported modes matching requested speed
@@ -390,18 +379,13 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_LINKMODES_MAX + 1];
struct ethtool_link_ksettings ksettings = {};
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
struct net_device *dev;
bool mod = false;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_LINKMODES_MAX, linkmodes_set_policy,
- info->extack);
- if (ret < 0)
- return ret;
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_LINKMODES_HEADER],
genl_info_net(info), info->extack,
diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c
index 4834091ec24c..fb676f349455 100644
--- a/net/ethtool/linkstate.c
+++ b/net/ethtool/linkstate.c
@@ -20,15 +20,9 @@ struct linkstate_reply_data {
#define LINKSTATE_REPDATA(__reply_base) \
container_of(__reply_base, struct linkstate_reply_data, base)
-static const struct nla_policy
-linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = {
- [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKSTATE_EXT_STATE] = { .type = NLA_REJECT },
- [ETHTOOL_A_LINKSTATE_EXT_SUBSTATE] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_linkstate_get_policy[] = {
+ [ETHTOOL_A_LINKSTATE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int linkstate_get_sqi(struct net_device *dev)
@@ -179,10 +173,8 @@ const struct ethnl_request_ops ethnl_linkstate_request_ops = {
.request_cmd = ETHTOOL_MSG_LINKSTATE_GET,
.reply_cmd = ETHTOOL_MSG_LINKSTATE_GET_REPLY,
.hdr_attr = ETHTOOL_A_LINKSTATE_HEADER,
- .max_attr = ETHTOOL_A_LINKSTATE_MAX,
.req_info_size = sizeof(struct linkstate_req_info),
.reply_data_size = sizeof(struct linkstate_reply_data),
- .request_policy = linkstate_get_policy,
.prepare_data = linkstate_prepare_data,
.reply_size = linkstate_reply_size,
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 0c3f54baec4e..50d3c8896f91 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -9,12 +9,24 @@ static struct genl_family ethtool_genl_family;
static bool ethnl_ok __read_mostly;
static u32 ethnl_bcast_seq;
-static const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_MAX + 1] = {
- [ETHTOOL_A_HEADER_UNSPEC] = { .type = NLA_REJECT },
+#define ETHTOOL_FLAGS_BASIC (ETHTOOL_FLAG_COMPACT_BITSETS | \
+ ETHTOOL_FLAG_OMIT_REPLY)
+#define ETHTOOL_FLAGS_STATS (ETHTOOL_FLAGS_BASIC | ETHTOOL_FLAG_STATS)
+
+const struct nla_policy ethnl_header_policy[] = {
[ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
[ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
.len = ALTIFNAMSIZ - 1 },
- [ETHTOOL_A_HEADER_FLAGS] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_BASIC),
+};
+
+const struct nla_policy ethnl_header_policy_stats[] = {
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_STATS),
};
/**
@@ -37,7 +49,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
const struct nlattr *header, struct net *net,
struct netlink_ext_ack *extack, bool require_dev)
{
- struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1];
+ struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy)];
const struct nlattr *devname_attr;
struct net_device *dev = NULL;
u32 flags = 0;
@@ -47,19 +59,15 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
NL_SET_ERR_MSG(extack, "request header missing");
return -EINVAL;
}
- ret = nla_parse_nested(tb, ETHTOOL_A_HEADER_MAX, header,
- ethnl_header_policy, extack);
+ /* No validation here, command policy should have a nested policy set
+ * for the header, therefore validation should have already been done.
+ */
+ ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy) - 1, header,
+ NULL, extack);
if (ret < 0)
return ret;
- if (tb[ETHTOOL_A_HEADER_FLAGS]) {
+ if (tb[ETHTOOL_A_HEADER_FLAGS])
flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]);
- if (flags & ~ETHTOOL_FLAG_ALL) {
- NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_FLAGS],
- "unrecognized request flags");
- nl_set_extack_cookie_u32(extack, ETHTOOL_FLAG_ALL);
- return -EOPNOTSUPP;
- }
- }
devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME];
if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) {
@@ -247,7 +255,7 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
/**
* ethnl_default_parse() - Parse request message
* @req_info: pointer to structure to put data into
- * @nlhdr: pointer to request message header
+ * @tb: parsed attributes
* @net: request netns
* @request_ops: struct request_ops for request type
* @extack: netlink extack for error reporting
@@ -259,37 +267,24 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
* Return: 0 on success or negative error code
*/
static int ethnl_default_parse(struct ethnl_req_info *req_info,
- const struct nlmsghdr *nlhdr, struct net *net,
+ struct nlattr **tb, struct net *net,
const struct ethnl_request_ops *request_ops,
struct netlink_ext_ack *extack, bool require_dev)
{
- struct nlattr **tb;
int ret;
- tb = kmalloc_array(request_ops->max_attr + 1, sizeof(tb[0]),
- GFP_KERNEL);
- if (!tb)
- return -ENOMEM;
-
- ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, request_ops->max_attr,
- request_ops->request_policy, extack);
- if (ret < 0)
- goto out;
ret = ethnl_parse_header_dev_get(req_info, tb[request_ops->hdr_attr],
net, extack, require_dev);
if (ret < 0)
- goto out;
+ return ret;
if (request_ops->parse_request) {
ret = request_ops->parse_request(req_info, tb, extack);
if (ret < 0)
- goto out;
+ return ret;
}
- ret = 0;
-out:
- kfree(tb);
- return ret;
+ return 0;
}
/**
@@ -334,8 +329,8 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
return -ENOMEM;
}
- ret = ethnl_default_parse(req_info, info->nlhdr, genl_info_net(info), ops,
- info->extack, !ops->allow_nodev_do);
+ ret = ethnl_default_parse(req_info, info->attrs, genl_info_net(info),
+ ops, info->extack, !ops->allow_nodev_do);
if (ret < 0)
goto err_dev;
ethnl_init_reply_data(reply_data, ops, req_info->dev);
@@ -480,6 +475,7 @@ out:
/* generic ->start() handler for GET requests */
static int ethnl_default_start(struct netlink_callback *cb)
{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
struct ethnl_reply_data *reply_data;
const struct ethnl_request_ops *ops;
@@ -502,8 +498,8 @@ static int ethnl_default_start(struct netlink_callback *cb)
goto free_req_info;
}
- ret = ethnl_default_parse(req_info, cb->nlh, sock_net(cb->skb->sk), ops,
- cb->extack, false);
+ ret = ethnl_default_parse(req_info, info->attrs, sock_net(cb->skb->sk),
+ ops, cb->extack, false);
if (req_info->dev) {
/* We ignore device specification in dump requests but as the
* same parser as for non-dump (doit) requests is used, it
@@ -696,6 +692,8 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_strset_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_strset_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_LINKINFO_GET,
@@ -703,11 +701,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_linkinfo_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkinfo_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_LINKINFO_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_linkinfo,
+ .policy = ethnl_linkinfo_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_LINKMODES_GET,
@@ -715,11 +717,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_linkmodes_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkmodes_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_LINKMODES_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_linkmodes,
+ .policy = ethnl_linkmodes_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_LINKSTATE_GET,
@@ -727,6 +733,8 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_linkstate_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkstate_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_DEBUG_GET,
@@ -734,11 +742,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_debug_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_debug_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_DEBUG_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_debug,
+ .policy = ethnl_debug_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_WOL_GET,
@@ -747,11 +759,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_wol_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_wol_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_WOL_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_wol,
+ .policy = ethnl_wol_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_FEATURES_GET,
@@ -759,11 +775,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_features_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_features_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_FEATURES_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_features,
+ .policy = ethnl_features_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_features_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_PRIVFLAGS_GET,
@@ -771,11 +791,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_privflags_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_privflags_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_PRIVFLAGS_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_privflags,
+ .policy = ethnl_privflags_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_RINGS_GET,
@@ -783,11 +807,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_rings_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rings_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_RINGS_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_rings,
+ .policy = ethnl_rings_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_CHANNELS_GET,
@@ -795,11 +823,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_channels_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_channels_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_CHANNELS_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_channels,
+ .policy = ethnl_channels_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_COALESCE_GET,
@@ -807,11 +839,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_coalesce_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_coalesce_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_COALESCE_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_coalesce,
+ .policy = ethnl_coalesce_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_PAUSE_GET,
@@ -819,11 +855,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_pause_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pause_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_PAUSE_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_pause,
+ .policy = ethnl_pause_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_EEE_GET,
@@ -831,11 +871,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_eee_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_eee_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_EEE_SET,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_eee,
+ .policy = ethnl_eee_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_TSINFO_GET,
@@ -843,22 +887,30 @@ static const struct genl_ops ethtool_genl_ops[] = {
.start = ethnl_default_start,
.dumpit = ethnl_default_dumpit,
.done = ethnl_default_done,
+ .policy = ethnl_tsinfo_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_CABLE_TEST_ACT,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_act_cable_test,
+ .policy = ethnl_cable_test_act_policy,
+ .maxattr = ARRAY_SIZE(ethnl_cable_test_act_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT,
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_act_cable_test_tdr,
+ .policy = ethnl_cable_test_tdr_act_policy,
+ .maxattr = ARRAY_SIZE(ethnl_cable_test_tdr_act_policy) - 1,
},
{
.cmd = ETHTOOL_MSG_TUNNEL_INFO_GET,
.doit = ethnl_tunnel_info_doit,
.start = ethnl_tunnel_info_start,
.dumpit = ethnl_tunnel_info_dumpit,
+ .policy = ethnl_tunnel_info_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1,
},
};
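With this conversion each ethtool netlink command carries its own attribute policy, and maxattr is derived from the policy array length, so the genetlink core validates the message and fills info->attrs before the handler runs. Attributes a command does not accept simply have no policy entry and are refused by strict validation, which is why the explicit NLA_REJECT rows could be dropped. A minimal sketch of the pattern; ETHTOOL_MSG_FOO_GET, ETHTOOL_A_FOO_HEADER and ethnl_foo_get_policy are placeholder names, not real attributes:

/* Placeholder command: only the request header nest is accepted. */
const struct nla_policy ethnl_foo_get_policy[] = {
	[ETHTOOL_A_FOO_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
};

static const struct genl_ops ethnl_foo_genl_op = {
	.cmd	 = ETHTOOL_MSG_FOO_GET,
	.doit	 = ethnl_default_doit,
	.policy	 = ethnl_foo_get_policy,
	/* highest initialized index == last attribute the command accepts */
	.maxattr = ARRAY_SIZE(ethnl_foo_get_policy) - 1,
};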
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index e2085005caac..d8efec516d86 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -266,10 +266,8 @@ static inline void ethnl_ops_complete(struct net_device *dev)
* @request_cmd: command id for request (GET)
* @reply_cmd: command id for reply (GET_REPLY)
* @hdr_attr: attribute type for request header
- * @max_attr: maximum (top level) attribute type
* @req_info_size: size of request info
* @reply_data_size: size of reply data
- * @request_policy: netlink policy for message contents
* @allow_nodev_do: allow non-dump request with no device identification
* @parse_request:
* Parse request except common header (struct ethnl_req_info). Common
@@ -312,10 +310,8 @@ struct ethnl_request_ops {
u8 request_cmd;
u8 reply_cmd;
u16 hdr_attr;
- unsigned int max_attr;
unsigned int req_info_size;
unsigned int reply_data_size;
- const struct nla_policy *request_policy;
bool allow_nodev_do;
int (*parse_request)(struct ethnl_req_info *req_info,
@@ -349,6 +345,37 @@ extern const struct ethnl_request_ops ethnl_pause_request_ops;
extern const struct ethnl_request_ops ethnl_eee_request_ops;
extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
+extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
+extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
+extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1];
+extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1];
+extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1];
+extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1];
+extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG + 1];
+extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1];
+extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1];
+extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1];
+extern const struct nla_policy ethnl_wol_get_policy[ETHTOOL_A_WOL_HEADER + 1];
+extern const struct nla_policy ethnl_wol_set_policy[ETHTOOL_A_WOL_SOPASS + 1];
+extern const struct nla_policy ethnl_features_get_policy[ETHTOOL_A_FEATURES_HEADER + 1];
+extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANTED + 1];
+extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1];
+extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1];
+extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1];
+extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1];
+extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1];
+extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1];
+extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1];
+extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL + 1];
+extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1];
+extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1];
+extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1];
+extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1];
+extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + 1];
+extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1];
+extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1];
+extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1];
+
int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c
index 7aea35d1e8a5..09998dc5c185 100644
--- a/net/ethtool/pause.c
+++ b/net/ethtool/pause.c
@@ -10,20 +10,23 @@ struct pause_req_info {
struct pause_reply_data {
struct ethnl_reply_data base;
struct ethtool_pauseparam pauseparam;
+ struct ethtool_pause_stats pausestat;
};
#define PAUSE_REPDATA(__reply_base) \
container_of(__reply_base, struct pause_reply_data, base)
-static const struct nla_policy
-pause_get_policy[ETHTOOL_A_PAUSE_MAX + 1] = {
- [ETHTOOL_A_PAUSE_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_PAUSE_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_REJECT },
- [ETHTOOL_A_PAUSE_RX] = { .type = NLA_REJECT },
- [ETHTOOL_A_PAUSE_TX] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_pause_get_policy[] = {
+ [ETHTOOL_A_PAUSE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_stats),
};
+static void ethtool_stats_init(u64 *stats, unsigned int n)
+{
+ while (n--)
+ stats[n] = ETHTOOL_STAT_NOT_SET;
+}
+
static int pause_prepare_data(const struct ethnl_req_info *req_base,
struct ethnl_reply_data *reply_base,
struct genl_info *info)
@@ -34,10 +37,17 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base,
if (!dev->ethtool_ops->get_pauseparam)
return -EOPNOTSUPP;
+
ret = ethnl_ops_begin(dev);
if (ret < 0)
return ret;
dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam);
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ dev->ethtool_ops->get_pause_stats) {
+ ethtool_stats_init((u64 *)&data->pausestat,
+ sizeof(data->pausestat) / 8);
+ dev->ethtool_ops->get_pause_stats(dev, &data->pausestat);
+ }
ethnl_ops_complete(dev);
return 0;
@@ -46,9 +56,50 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base,
static int pause_reply_size(const struct ethnl_req_info *req_base,
const struct ethnl_reply_data *reply_base)
{
- return nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */
+ int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */
nla_total_size(sizeof(u8)) + /* _PAUSE_RX */
nla_total_size(sizeof(u8)); /* _PAUSE_TX */
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS)
+ n += nla_total_size(0) + /* _PAUSE_STATS */
+ nla_total_size_64bit(sizeof(u64)) *
+ (ETHTOOL_A_PAUSE_STAT_MAX - 2);
+ return n;
+}
+
+static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype,
+ u16 padtype)
+{
+ if (val == ETHTOOL_STAT_NOT_SET)
+ return 0;
+ if (nla_put_u64_64bit(skb, attrtype, val, padtype))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int pause_put_stats(struct sk_buff *skb,
+ const struct ethtool_pause_stats *pause_stats)
+{
+ const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (ethtool_put_stat(skb, pause_stats->tx_pause_frames,
+ ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) ||
+ ethtool_put_stat(skb, pause_stats->rx_pause_frames,
+ ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad))
+ goto err_cancel;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
}
static int pause_fill_reply(struct sk_buff *skb,
@@ -63,6 +114,10 @@ static int pause_fill_reply(struct sk_buff *skb,
nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause))
return -EMSGSIZE;
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ pause_put_stats(skb, &data->pausestat))
+ return -EMSGSIZE;
+
return 0;
}
@@ -70,10 +125,8 @@ const struct ethnl_request_ops ethnl_pause_request_ops = {
.request_cmd = ETHTOOL_MSG_PAUSE_GET,
.reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY,
.hdr_attr = ETHTOOL_A_PAUSE_HEADER,
- .max_attr = ETHTOOL_A_PAUSE_MAX,
.req_info_size = sizeof(struct pause_req_info),
.reply_data_size = sizeof(struct pause_reply_data),
- .request_policy = pause_get_policy,
.prepare_data = pause_prepare_data,
.reply_size = pause_reply_size,
@@ -82,10 +135,9 @@ const struct ethnl_request_ops ethnl_pause_request_ops = {
/* PAUSE_SET */
-static const struct nla_policy
-pause_set_policy[ETHTOOL_A_PAUSE_MAX + 1] = {
- [ETHTOOL_A_PAUSE_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_PAUSE_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_pause_set_policy[] = {
+ [ETHTOOL_A_PAUSE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 },
[ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 },
[ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 },
@@ -93,18 +145,14 @@ pause_set_policy[ETHTOOL_A_PAUSE_MAX + 1] = {
int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_PAUSE_MAX + 1];
struct ethtool_pauseparam params = {};
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
const struct ethtool_ops *ops;
struct net_device *dev;
bool mod = false;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_PAUSE_MAX,
- pause_set_policy, info->extack);
- if (ret < 0)
- return ret;
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_PAUSE_HEADER],
genl_info_net(info), info->extack,
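PAUSE_GET replies now carry an ETHTOOL_A_PAUSE_STATS nest when the request header sets ETHTOOL_FLAG_STATS: the core pre-fills the stats structure with ETHTOOL_STAT_NOT_SET, the driver overwrites only the counters it maintains, and ethtool_put_stat() skips anything left unset. A hedged sketch of a driver-side callback, assuming the get_pause_stats signature used above; foo_priv, foo_read_counter and the counter names are hypothetical:

/* Hypothetical driver callback; foo_* identifiers are placeholders. */
static void foo_get_pause_stats(struct net_device *dev,
				struct ethtool_pause_stats *stats)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* Only counters the device tracks are written; fields left at
	 * ETHTOOL_STAT_NOT_SET are omitted from the netlink reply by
	 * ethtool_put_stat() above.
	 */
	stats->tx_pause_frames = foo_read_counter(priv, FOO_MIB_TX_PAUSE);
	stats->rx_pause_frames = foo_read_counter(priv, FOO_MIB_RX_PAUSE);
}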
diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c
index 77447dceb109..fc9f3be23a19 100644
--- a/net/ethtool/privflags.c
+++ b/net/ethtool/privflags.c
@@ -18,11 +18,9 @@ struct privflags_reply_data {
#define PRIVFLAGS_REPDATA(__reply_base) \
container_of(__reply_base, struct privflags_reply_data, base)
-static const struct nla_policy
-privflags_get_policy[ETHTOOL_A_PRIVFLAGS_MAX + 1] = {
- [ETHTOOL_A_PRIVFLAGS_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_PRIVFLAGS_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_privflags_get_policy[] = {
+ [ETHTOOL_A_PRIVFLAGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int ethnl_get_priv_flags_info(struct net_device *dev,
@@ -124,10 +122,8 @@ const struct ethnl_request_ops ethnl_privflags_request_ops = {
.request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET,
.reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY,
.hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER,
- .max_attr = ETHTOOL_A_PRIVFLAGS_MAX,
.req_info_size = sizeof(struct privflags_req_info),
.reply_data_size = sizeof(struct privflags_reply_data),
- .request_policy = privflags_get_policy,
.prepare_data = privflags_prepare_data,
.reply_size = privflags_reply_size,
@@ -137,18 +133,17 @@ const struct ethnl_request_ops ethnl_privflags_request_ops = {
/* PRIVFLAGS_SET */
-static const struct nla_policy
-privflags_set_policy[ETHTOOL_A_PRIVFLAGS_MAX + 1] = {
- [ETHTOOL_A_PRIVFLAGS_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_PRIVFLAGS_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_privflags_set_policy[] = {
+ [ETHTOOL_A_PRIVFLAGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED },
};
int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_PRIVFLAGS_MAX + 1];
const char (*names)[ETH_GSTRING_LEN] = NULL;
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
const struct ethtool_ops *ops;
struct net_device *dev;
unsigned int nflags;
@@ -157,11 +152,6 @@ int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info)
u32 flags;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_PRIVFLAGS_MAX, privflags_set_policy,
- info->extack);
- if (ret < 0)
- return ret;
if (!tb[ETHTOOL_A_PRIVFLAGS_FLAGS])
return -EINVAL;
ret = ethnl_bitset_is_compact(tb[ETHTOOL_A_PRIVFLAGS_FLAGS], &compact);
diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c
index 5422526f4eef..4e097812a967 100644
--- a/net/ethtool/rings.c
+++ b/net/ethtool/rings.c
@@ -15,18 +15,9 @@ struct rings_reply_data {
#define RINGS_REPDATA(__reply_base) \
container_of(__reply_base, struct rings_reply_data, base)
-static const struct nla_policy
-rings_get_policy[ETHTOOL_A_RINGS_MAX + 1] = {
- [ETHTOOL_A_RINGS_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_RINGS_RX_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_RX_MINI_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_RX_JUMBO_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_TX_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_RX] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_TX] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_rings_get_policy[] = {
+ [ETHTOOL_A_RINGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int rings_prepare_data(const struct ethnl_req_info *req_base,
@@ -97,10 +88,8 @@ const struct ethnl_request_ops ethnl_rings_request_ops = {
.request_cmd = ETHTOOL_MSG_RINGS_GET,
.reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY,
.hdr_attr = ETHTOOL_A_RINGS_HEADER,
- .max_attr = ETHTOOL_A_RINGS_MAX,
.req_info_size = sizeof(struct rings_req_info),
.reply_data_size = sizeof(struct rings_reply_data),
- .request_policy = rings_get_policy,
.prepare_data = rings_prepare_data,
.reply_size = rings_reply_size,
@@ -109,14 +98,9 @@ const struct ethnl_request_ops ethnl_rings_request_ops = {
/* RINGS_SET */
-static const struct nla_policy
-rings_set_policy[ETHTOOL_A_RINGS_MAX + 1] = {
- [ETHTOOL_A_RINGS_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_RINGS_RX_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_RX_MINI_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_RX_JUMBO_MAX] = { .type = NLA_REJECT },
- [ETHTOOL_A_RINGS_TX_MAX] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_rings_set_policy[] = {
+ [ETHTOOL_A_RINGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 },
[ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 },
[ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 },
@@ -125,20 +109,15 @@ rings_set_policy[ETHTOOL_A_RINGS_MAX + 1] = {
int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *tb[ETHTOOL_A_RINGS_MAX + 1];
struct ethtool_ringparam ringparam = {};
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
const struct nlattr *err_attr;
const struct ethtool_ops *ops;
struct net_device *dev;
bool mod = false;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
- ETHTOOL_A_RINGS_MAX, rings_set_policy,
- info->extack);
- if (ret < 0)
- return ret;
ret = ethnl_parse_header_dev_get(&req_info,
tb[ETHTOOL_A_RINGS_HEADER],
genl_info_net(info), info->extack,
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index 82707b662fe4..c3a5489964cd 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -99,18 +99,15 @@ struct strset_reply_data {
#define STRSET_REPDATA(__reply_base) \
container_of(__reply_base, struct strset_reply_data, base)
-static const struct nla_policy strset_get_policy[ETHTOOL_A_STRSET_MAX + 1] = {
- [ETHTOOL_A_STRSET_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_STRSET_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_strset_get_policy[] = {
+ [ETHTOOL_A_STRSET_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_STRSET_STRINGSETS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_STRSET_COUNTS_ONLY] = { .type = NLA_FLAG },
};
-static const struct nla_policy
-get_stringset_policy[ETHTOOL_A_STRINGSET_MAX + 1] = {
- [ETHTOOL_A_STRINGSET_UNSPEC] = { .type = NLA_REJECT },
+static const struct nla_policy get_stringset_policy[] = {
[ETHTOOL_A_STRINGSET_ID] = { .type = NLA_U32 },
- [ETHTOOL_A_STRINGSET_COUNT] = { .type = NLA_REJECT },
- [ETHTOOL_A_STRINGSET_STRINGS] = { .type = NLA_REJECT },
};
/**
@@ -138,10 +135,10 @@ static bool strset_include(const struct strset_req_info *info,
static int strset_get_id(const struct nlattr *nest, u32 *val,
struct netlink_ext_ack *extack)
{
- struct nlattr *tb[ETHTOOL_A_STRINGSET_MAX + 1];
+ struct nlattr *tb[ARRAY_SIZE(get_stringset_policy)];
int ret;
- ret = nla_parse_nested(tb, ETHTOOL_A_STRINGSET_MAX, nest,
+ ret = nla_parse_nested(tb, ARRAY_SIZE(get_stringset_policy) - 1, nest,
get_stringset_policy, extack);
if (ret < 0)
return ret;
@@ -152,9 +149,7 @@ static int strset_get_id(const struct nlattr *nest, u32 *val,
return 0;
}
-static const struct nla_policy
-strset_stringsets_policy[ETHTOOL_A_STRINGSETS_MAX + 1] = {
- [ETHTOOL_A_STRINGSETS_UNSPEC] = { .type = NLA_REJECT },
+static const struct nla_policy strset_stringsets_policy[] = {
[ETHTOOL_A_STRINGSETS_STRINGSET] = { .type = NLA_NESTED },
};
@@ -169,7 +164,8 @@ static int strset_parse_request(struct ethnl_req_info *req_base,
if (!nest)
return 0;
- ret = nla_validate_nested(nest, ETHTOOL_A_STRINGSETS_MAX,
+ ret = nla_validate_nested(nest,
+ ARRAY_SIZE(strset_stringsets_policy) - 1,
strset_stringsets_policy, extack);
if (ret < 0)
return ret;
@@ -186,7 +182,7 @@ static int strset_parse_request(struct ethnl_req_info *req_base,
ret = strset_get_id(attr, &id, extack);
if (ret < 0)
return ret;
- if (ret >= ETH_SS_COUNT) {
+ if (id >= ETH_SS_COUNT) {
NL_SET_ERR_MSG_ATTR(extack, attr,
"unknown string set id");
return -EOPNOTSUPP;
@@ -445,10 +441,8 @@ const struct ethnl_request_ops ethnl_strset_request_ops = {
.request_cmd = ETHTOOL_MSG_STRSET_GET,
.reply_cmd = ETHTOOL_MSG_STRSET_GET_REPLY,
.hdr_attr = ETHTOOL_A_STRSET_HEADER,
- .max_attr = ETHTOOL_A_STRSET_MAX,
.req_info_size = sizeof(struct strset_req_info),
.reply_data_size = sizeof(struct strset_reply_data),
- .request_policy = strset_get_policy,
.allow_nodev_do = true,
.parse_request = strset_parse_request,
diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c
index 7cb5b512b77c..63b5814bd460 100644
--- a/net/ethtool/tsinfo.c
+++ b/net/ethtool/tsinfo.c
@@ -18,14 +18,9 @@ struct tsinfo_reply_data {
#define TSINFO_REPDATA(__reply_base) \
container_of(__reply_base, struct tsinfo_reply_data, base)
-static const struct nla_policy
-tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = {
- [ETHTOOL_A_TSINFO_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_TSINFO_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_TSINFO_TIMESTAMPING] = { .type = NLA_REJECT },
- [ETHTOOL_A_TSINFO_TX_TYPES] = { .type = NLA_REJECT },
- [ETHTOOL_A_TSINFO_RX_FILTERS] = { .type = NLA_REJECT },
- [ETHTOOL_A_TSINFO_PHC_INDEX] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_tsinfo_get_policy[] = {
+ [ETHTOOL_A_TSINFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int tsinfo_prepare_data(const struct ethnl_req_info *req_base,
@@ -132,10 +127,8 @@ const struct ethnl_request_ops ethnl_tsinfo_request_ops = {
.request_cmd = ETHTOOL_MSG_TSINFO_GET,
.reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY,
.hdr_attr = ETHTOOL_A_TSINFO_HEADER,
- .max_attr = ETHTOOL_A_TSINFO_MAX,
.req_info_size = sizeof(struct tsinfo_req_info),
.reply_data_size = sizeof(struct tsinfo_reply_data),
- .request_policy = tsinfo_get_policy,
.prepare_data = tsinfo_prepare_data,
.reply_size = tsinfo_reply_size,
diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c
index d93bf2da0f34..e7f2ee0d2471 100644
--- a/net/ethtool/tunnels.c
+++ b/net/ethtool/tunnels.c
@@ -8,10 +8,9 @@
#include "common.h"
#include "netlink.h"
-static const struct nla_policy
-ethtool_tunnel_info_policy[ETHTOOL_A_TUNNEL_INFO_MAX + 1] = {
- [ETHTOOL_A_TUNNEL_INFO_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_TUNNEL_INFO_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_tunnel_info_get_policy[] = {
+ [ETHTOOL_A_TUNNEL_INFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN));
@@ -161,35 +160,19 @@ err_cancel_ports:
return -EMSGSIZE;
}
-static int
-ethnl_tunnel_info_req_parse(struct ethnl_req_info *req_info,
- const struct nlmsghdr *nlhdr, struct net *net,
- struct netlink_ext_ack *extack, bool require_dev)
-{
- struct nlattr *tb[ETHTOOL_A_TUNNEL_INFO_MAX + 1];
- int ret;
-
- ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_TUNNEL_INFO_MAX,
- ethtool_tunnel_info_policy, extack);
- if (ret < 0)
- return ret;
-
- return ethnl_parse_header_dev_get(req_info,
- tb[ETHTOOL_A_TUNNEL_INFO_HEADER],
- net, extack, require_dev);
-}
-
int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info)
{
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
struct sk_buff *rskb;
void *reply_payload;
int reply_len;
int ret;
- ret = ethnl_tunnel_info_req_parse(&req_info, info->nlhdr,
- genl_info_net(info), info->extack,
- true);
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_TUNNEL_INFO_HEADER],
+ genl_info_net(info), info->extack,
+ true);
if (ret < 0)
return ret;
@@ -233,16 +216,19 @@ struct ethnl_tunnel_info_dump_ctx {
int ethnl_tunnel_info_start(struct netlink_callback *cb)
{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx;
+ struct nlattr **tb = info->attrs;
int ret;
BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
memset(ctx, 0, sizeof(*ctx));
- ret = ethnl_tunnel_info_req_parse(&ctx->req_info, cb->nlh,
- sock_net(cb->skb->sk), cb->extack,
- false);
+ ret = ethnl_parse_header_dev_get(&ctx->req_info,
+ tb[ETHTOOL_A_TUNNEL_INFO_HEADER],
+ sock_net(cb->skb->sk), cb->extack,
+ false);
if (ctx->req_info.dev) {
dev_put(ctx->req_info.dev);
ctx->req_info.dev = NULL;
diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c
index 1798421e9f1c..ada7df2331d2 100644
--- a/net/ethtool/wol.c
+++ b/net/ethtool/wol.c
@@ -17,12 +17,9 @@ struct wol_reply_data {
#define WOL_REPDATA(__reply_base) \
container_of(__reply_base, struct wol_reply_data, base)
-static const struct nla_policy
-wol_get_policy[ETHTOOL_A_WOL_MAX + 1] = {
- [ETHTOOL_A_WOL_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_WOL_HEADER] = { .type = NLA_NESTED },
- [ETHTOOL_A_WOL_MODES] = { .type = NLA_REJECT },
- [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_REJECT },
+const struct nla_policy ethnl_wol_get_policy[] = {
+ [ETHTOOL_A_WOL_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
};
static int wol_prepare_data(const struct ethnl_req_info *req_base,
@@ -89,10 +86,8 @@ const struct ethnl_request_ops ethnl_wol_request_ops = {
.request_cmd = ETHTOOL_MSG_WOL_GET,
.reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY,
.hdr_attr = ETHTOOL_A_WOL_HEADER,
- .max_attr = ETHTOOL_A_WOL_MAX,
.req_info_size = sizeof(struct wol_req_info),
.reply_data_size = sizeof(struct wol_reply_data),
- .request_policy = wol_get_policy,
.prepare_data = wol_prepare_data,
.reply_size = wol_reply_size,
@@ -101,10 +96,9 @@ const struct ethnl_request_ops ethnl_wol_request_ops = {
/* WOL_SET */
-static const struct nla_policy
-wol_set_policy[ETHTOOL_A_WOL_MAX + 1] = {
- [ETHTOOL_A_WOL_UNSPEC] = { .type = NLA_REJECT },
- [ETHTOOL_A_WOL_HEADER] = { .type = NLA_NESTED },
+const struct nla_policy ethnl_wol_set_policy[] = {
+ [ETHTOOL_A_WOL_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
[ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED },
[ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY,
.len = SOPASS_MAX },
@@ -113,16 +107,12 @@ wol_set_policy[ETHTOOL_A_WOL_MAX + 1] = {
int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info)
{
struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
- struct nlattr *tb[ETHTOOL_A_WOL_MAX + 1];
struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
struct net_device *dev;
bool mod = false;
int ret;
- ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_WOL_MAX,
- wol_set_policy, info->extack);
- if (ret < 0)
- return ret;
ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_WOL_HEADER],
genl_info_net(info), info->extack,
true);
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
index 7e11a6c35bc3..4cfd9e829c7b 100644
--- a/net/hsr/hsr_debugfs.c
+++ b/net/hsr/hsr_debugfs.c
@@ -60,17 +60,7 @@ hsr_node_table_show(struct seq_file *sfp, void *data)
return 0;
}
-/* hsr_node_table_open - Open the node_table file
- *
- * Description:
- * This routine opens a debugfs file node_table of specific hsr
- * or prp device
- */
-static int
-hsr_node_table_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, hsr_node_table_show, inode->i_private);
-}
+DEFINE_SHOW_ATTRIBUTE(hsr_node_table);
void hsr_debugfs_rename(struct net_device *dev)
{
@@ -85,13 +75,6 @@ void hsr_debugfs_rename(struct net_device *dev)
priv->node_tbl_root = d;
}
-static const struct file_operations hsr_fops = {
- .open = hsr_node_table_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* hsr_debugfs_init - create hsr node_table file for dumping
* the node table
*
@@ -113,7 +96,7 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
de = debugfs_create_file("node_table", S_IFREG | 0444,
priv->node_tbl_root, priv,
- &hsr_fops);
+ &hsr_node_table_fops);
if (IS_ERR(de)) {
pr_err("Cannot create hsr node_table file\n");
debugfs_remove(priv->node_tbl_root);
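DEFINE_SHOW_ATTRIBUTE(hsr_node_table) generates the single_open() wrapper and the file_operations that the removed boilerplate spelled out by hand. Roughly, the macro expands to the following; this is a sketch of the seq_file helper for illustration, not part of the patch:

static int hsr_node_table_open(struct inode *inode, struct file *file)
{
	return single_open(file, hsr_node_table_show, inode->i_private);
}

static const struct file_operations hsr_node_table_fops = {
	.owner		= THIS_MODULE,
	.open		= hsr_node_table_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};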
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 7dc92ce5a134..a9c30a608e35 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -217,7 +217,10 @@ struct hsr_priv {
u8 net_id; /* for PRP, it occupies most significant 3 bits
* of lan_id
*/
- unsigned char sup_multicast_addr[ETH_ALEN];
+ unsigned char sup_multicast_addr[ETH_ALEN] __aligned(sizeof(u16));
+ /* Align to u16 boundary to avoid unaligned access
+ * in ether_addr_equal
+ */
#ifdef CONFIG_DEBUG_FS
struct dentry *node_tbl_root;
#endif
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 0e4681cf71db..f3c8f91dbe2c 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -493,7 +493,7 @@ fail:
return res;
}
-static const struct genl_ops hsr_ops[] = {
+static const struct genl_small_ops hsr_ops[] = {
{
.cmd = HSR_C_GET_NODE_STATUS,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -518,8 +518,8 @@ static struct genl_family hsr_genl_family __ro_after_init = {
.policy = hsr_genl_policy,
.netnsok = true,
.module = THIS_MODULE,
- .ops = hsr_ops,
- .n_ops = ARRAY_SIZE(hsr_ops),
+ .small_ops = hsr_ops,
+ .n_small_ops = ARRAY_SIZE(hsr_ops),
.mcgrps = hsr_mcgrps,
.n_mcgrps = ARRAY_SIZE(hsr_mcgrps),
};
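genl_small_ops is the slimmer operations table for families that never need per-op policy, maxattr, start or done callbacks; registering it through .small_ops/.n_small_ops keeps the per-command array compact. Roughly, the cut-down type carries only these fields (a sketch of the new structure for illustration, not part of this patch):

struct genl_small_ops {
	int	(*doit)(struct sk_buff *skb, struct genl_info *info);
	int	(*dumpit)(struct sk_buff *skb, struct netlink_callback *cb);
	u8	cmd;
	u8	internal_flags;
	u8	flags;
	u8	validate;
};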
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 7fe3b6b6c495..b07abc38b4b3 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -81,7 +81,7 @@ int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info)
return genlmsg_reply(msg, info);
}
-static const struct genl_ops ieee802154_ops[] = {
+static const struct genl_small_ops ieee802154_ops[] = {
/* see nl-phy.c */
IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy,
ieee802154_dump_phy),
@@ -130,8 +130,8 @@ struct genl_family nl802154_family __ro_after_init = {
.maxattr = IEEE802154_ATTR_MAX,
.policy = ieee802154_policy,
.module = THIS_MODULE,
- .ops = ieee802154_ops,
- .n_ops = ARRAY_SIZE(ieee802154_ops),
+ .small_ops = ieee802154_ops,
+ .n_small_ops = ARRAY_SIZE(ieee802154_ops),
.mcgrps = ieee802154_mcgrps,
.n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps),
};
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index 6d091e419d3e..9c640d670ffe 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -149,7 +149,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info)
if (info->attrs[IEEE802154_ATTR_DEV_NAME]) {
char name[IFNAMSIZ + 1];
- nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME],
+ nla_strscpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME],
sizeof(name));
dev = dev_get_by_name(&init_net, name);
} else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4307503a6f0b..b94fa8eb831b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
- err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
+ err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr);
if (err)
return err;
@@ -1017,6 +1017,7 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
+ .flags = PROTO_CMSG_DATA_ONLY,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 687971d83b4e..922dd73e5740 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -125,6 +125,7 @@ static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
static void parp_redo(struct sk_buff *skb);
+static int arp_is_multicast(const void *pkey);
static const struct neigh_ops arp_generic_ops = {
.family = AF_INET,
@@ -156,6 +157,7 @@ struct neigh_table arp_tbl = {
.key_eq = arp_key_eq,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
+ .is_multicast = arp_is_multicast,
.id = "arp_cache",
.parms = {
.tbl = &arp_tbl,
@@ -928,6 +930,10 @@ static void parp_redo(struct sk_buff *skb)
arp_process(dev_net(skb->dev), NULL, skb);
}
+static int arp_is_multicast(const void *pkey)
+{
+ return ipv4_is_multicast(*((__be32 *)pkey));
+}
/*
* Receive an arp request from the device layer.
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index e3939f76b024..d520e61649c8 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -28,27 +28,6 @@ static u32 unsupported_ops[] = {
static const struct btf_type *tcp_sock_type;
static u32 tcp_sock_id, sock_id;
-static int btf_sk_storage_get_ids[5];
-static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly;
-
-static int btf_sk_storage_delete_ids[5];
-static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly;
-
-static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids,
- const struct bpf_func_proto *from)
-{
- int i;
-
- *to = *from;
- to->btf_id = to_btf_ids;
- for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) {
- if (to->arg_type[i] == ARG_PTR_TO_SOCKET) {
- to->arg_type[i] = ARG_PTR_TO_BTF_ID;
- to->btf_id[i] = tcp_sock_id;
- }
- }
-}
-
static int bpf_tcp_ca_init(struct btf *btf)
{
s32 type_id;
@@ -64,13 +43,6 @@ static int bpf_tcp_ca_init(struct btf *btf)
tcp_sock_id = type_id;
tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
- convert_sk_func_proto(&btf_sk_storage_get_proto,
- btf_sk_storage_get_ids,
- &bpf_sk_storage_get_proto);
- convert_sk_func_proto(&btf_sk_storage_delete_proto,
- btf_sk_storage_delete_ids,
- &bpf_sk_storage_delete_proto);
-
return 0;
}
@@ -123,6 +95,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size,
}
static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf *btf,
const struct btf_type *t, int off,
int size, enum bpf_access_type atype,
u32 *next_btf_id)
@@ -130,7 +103,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
size_t end;
if (atype == BPF_READ)
- return btf_struct_access(log, t, off, size, atype, next_btf_id);
+ return btf_struct_access(log, btf, t, off, size, atype, next_btf_id);
if (t != tcp_sock_type) {
bpf_log(log, "only read is supported\n");
@@ -185,8 +158,8 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
/* In case we want to report error later */
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &tcp_sock_id,
.arg2_type = ARG_ANYTHING,
- .btf_id = &tcp_sock_id,
};
static const struct bpf_func_proto *
@@ -197,9 +170,9 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
case BPF_FUNC_tcp_send_ack:
return &bpf_tcp_send_ack_proto;
case BPF_FUNC_sk_storage_get:
- return &btf_sk_storage_get_proto;
+ return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
- return &btf_sk_storage_delete_proto;
+ return &bpf_sk_storage_delete_proto;
default:
return bpf_base_func_proto(func_id);
}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 2eb71579f4d2..471d33a0d095 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -498,7 +498,7 @@ static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
/**
* cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
* @doi: the DOI value
- * @audit_secid: the LSM secid to use in the audit message
+ * @audit_info: NetLabel audit information
*
* Description:
* Removes a DOI definition from the CIPSO engine. The NetLabel routines will
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 123a6d39438f..75f67994fc85 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -650,8 +650,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
struct in_device *in_dev;
struct ifaddrmsg *ifm;
struct in_ifaddr *ifa;
-
- int err = -EINVAL;
+ int err;
ASSERT_RTNL();
@@ -881,7 +880,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]);
if (tb[IFA_LABEL])
- nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+ nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 8b07f3a4f2db..a3271ec3e162 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -443,7 +443,6 @@ static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
u8 *tail;
- u8 *vaddr;
int nfrags;
int esph_offset;
struct page *page;
@@ -485,14 +484,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
page = pfrag->page;
get_page(page);
- vaddr = kmap_atomic(page);
-
- tail = vaddr + pfrag->offset;
+ tail = page_address(page) + pfrag->offset;
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
- kunmap_atomic(vaddr);
-
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 86a23e4a6a50..84bb707bd88d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -292,7 +292,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
.flowi4_iif = LOOPBACK_IFINDEX,
.flowi4_oif = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
- .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
+ .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
.flowi4_scope = scope,
.flowi4_mark = vmark ? skb->mark : 0,
};
@@ -696,7 +696,7 @@ int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
break;
case AF_INET6:
-#ifdef CONFIG_IPV6
+#if IS_ENABLED(CONFIG_IPV6)
if (alen != sizeof(struct in6_addr)) {
NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
return -EINVAL;
@@ -825,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
if (has_gw && has_via) {
NL_SET_ERR_MSG(extack,
"Nexthop configuration can not contain both GATEWAY and VIA");
- goto errout;
+ return -EINVAL;
}
return 0;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1f75dc686b6b..b5400cec4f69 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -973,7 +973,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
char tmp[TCP_CA_NAME_MAX];
bool ecn_ca = false;
- nla_strlcpy(tmp, nla, sizeof(tmp));
+ nla_strscpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
} else {
if (nla_len(nla) != sizeof(u32))
@@ -1641,9 +1641,8 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
break;
}
- *flags |= (nhc->nhc_flags & RTNH_F_ONLINK);
- if (nhc->nhc_flags & RTNH_F_OFFLOAD)
- *flags |= RTNH_F_OFFLOAD;
+ *flags |= (nhc->nhc_flags &
+ (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP));
if (!skip_oif && nhc->nhc_dev &&
nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex))
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ffc5332f1390..28117c05dc35 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2100,15 +2100,6 @@ static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
KEYLENGTH - fa->fa_slen, tb->tb_id,
info, NLM_F_REPLACE);
-
- /* call_fib_entry_notifiers will be removed when
- * in-kernel notifier is implemented and supported
- * for nexthop objects
- */
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
- n->key,
- KEYLENGTH - fa->fa_slen, fa,
- NULL);
}
}
}
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index abd083415f89..e5f69b0bf3df 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -237,7 +237,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
/* We can clear the encap_mark for FOU as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
- * header to the outer L3 tunnel header, or we are are simply
+ * header to the outer L3 tunnel header, or we are simply
* treating the GRE tunnel header as though it is a UDP protocol
* specific header such as VXLAN or GENEVE.
*/
@@ -429,7 +429,7 @@ next_proto:
/* We can clear the encap_mark for GUE as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
- * header to the outer L3 tunnel header, or we are are simply
+ * header to the outer L3 tunnel header, or we are simply
* treating the GRE tunnel header as though it is a UDP protocol
* specific header such as VXLAN or GENEVE.
*/
@@ -911,7 +911,7 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static const struct genl_ops fou_nl_ops[] = {
+static const struct genl_small_ops fou_nl_ops[] = {
{
.cmd = FOU_CMD_ADD,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -940,8 +940,8 @@ static struct genl_family fou_nl_family __ro_after_init = {
.policy = fou_nl_policy,
.netnsok = true,
.module = THIS_MODULE,
- .ops = fou_nl_ops,
- .n_ops = ARRAY_SIZE(fou_nl_ops),
+ .small_ops = fou_nl_ops,
+ .n_small_ops = ARRAY_SIZE(fou_nl_ops),
};
size_t fou_encap_hlen(struct ip_tunnel_encap *e)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 66fdbfe5447c..5d1e6fe9d838 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -128,7 +128,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
* to 0 and sets the configured key in the
* inner erspan header field
*/
- if (greh->protocol == htons(ETH_P_ERSPAN) ||
+ if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) ||
greh->protocol == htons(ETH_P_ERSPAN2)) {
struct erspan_base_hdr *ershdr;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cf36f955bfe6..396b492c804f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -239,7 +239,7 @@ static struct {
/**
* icmp_global_allow - Are we allowed to send one more ICMP message ?
*
- * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
+ * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec.
* Returns false if we reached the limit and can not send another packet.
* Note: called with BH disabled
*/
@@ -267,7 +267,10 @@ bool icmp_global_allow(void)
}
credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
if (credit) {
- credit--;
+ /* We want to use a credit of one on average, but need to randomize
+ * it for security reasons.
+ */
+ credit = max_t(int, credit - prandom_u32_max(3), 0);
rc = true;
}
WRITE_ONCE(icmp_global.credit, credit);
@@ -352,7 +355,7 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
csum = skb_copy_and_csum_bits(icmp_param->skb,
icmp_param->offset + offset,
- to, len, 0);
+ to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
if (icmp_pointers[icmp_param->data.icmph.type].error)
@@ -376,15 +379,15 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
ip_flush_pending_frames(sk);
} else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
struct icmphdr *icmph = icmp_hdr(skb);
- __wsum csum = 0;
+ __wsum csum;
struct sk_buff *skb1;
+ csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+ (char *)icmph,
+ icmp_param->head_len);
skb_queue_walk(&sk->sk_write_queue, skb1) {
csum = csum_add(csum, skb1->csum);
}
- csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
- (char *)icmph,
- icmp_param->head_len, csum);
icmph->checksum = csum_fold(csum);
skb->ip_summed = CHECKSUM_NONE;
ip_push_pending_frames(sk, fl4);
@@ -444,7 +447,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
- security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
goto out_unlock;
@@ -457,6 +460,23 @@ out_bh_enable:
local_bh_enable();
}
+/*
+ * The device used for looking up which routing table to use for sending an ICMP
+ * error is preferably the source device, when it is set; this should ensure the
+ * ICMP error can be sent back to the source host. Otherwise look up using the
+ * routing table of the destination device, and fall back to the main routing
+ * table (index 0) if neither device is set.
+ */
+static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb)
+{
+ struct net_device *route_lookup_dev = NULL;
+
+ if (skb->dev)
+ route_lookup_dev = skb->dev;
+ else if (skb_dst(skb))
+ route_lookup_dev = skb_dst(skb)->dev;
+ return route_lookup_dev;
+}
+
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
@@ -465,6 +485,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
int type, int code,
struct icmp_bxm *param)
{
+ struct net_device *route_lookup_dev;
struct rtable *rt, *rt2;
struct flowi4 fl4_dec;
int err;
@@ -479,9 +500,10 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
- fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
+ route_lookup_dev = icmp_get_route_lookup_dev(skb_in);
+ fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev);
- security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
+ security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4));
rt = ip_route_output_key_hash(net, fl4, skb_in);
if (IS_ERR(rt))
return rt;
@@ -503,7 +525,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
if (err)
goto relookup_failed;
- if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev,
+ if (inet_addr_type_dev_table(net, route_lookup_dev,
fl4_dec.saddr) == RTN_LOCAL) {
rt2 = __ip_route_output_key(net, &fl4_dec);
if (IS_ERR(rt2))
@@ -690,9 +712,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
rcu_read_unlock();
}
- tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+ tos = icmp_pointers[type].error ? (RT_TOS(iph->tos) |
IPTOS_PREC_INTERNETCONTROL) :
- iph->tos;
+ iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
@@ -784,7 +806,7 @@ EXPORT_SYMBOL(icmp_ndo_send);
static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
const struct net_protocol *ipprot;
int protocol = iph->protocol;
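The icmp_global_allow() change keeps the same token-bucket shape but spends a randomized amount of credit per allowed message, which makes the global ICMP rate limit harder to probe from outside. A minimal, self-contained sketch of that bucket (names and the caller-supplied random value are illustrative, not the kernel's API):

	struct tb_sketch {
		unsigned int credit;	/* tokens currently available */
		unsigned long stamp;	/* last refill time, in ticks */
	};

	/* Refill at `rate` tokens per `hz` ticks, capped at `burst`, then
	 * spend `rnd` tokens (rnd uniform in [0, 2], average 1) if any
	 * credit is left.
	 */
	static bool tb_allow_sketch(struct tb_sketch *b, unsigned int rate,
				    unsigned int burst, unsigned int hz,
				    unsigned long now, unsigned int rnd)
	{
		unsigned long delta = now - b->stamp;
		unsigned int credit = b->credit + (rate * delta) / hz;

		if (credit > burst)
			credit = burst;
		b->stamp = now;
		if (!credit) {
			b->credit = 0;
			return false;
		}
		b->credit = credit > rnd ? credit - rnd : 0;
		return true;
	}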
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index b457dd2d6c75..6bd7ca09af03 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -564,7 +564,7 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+ icsk->icsk_pending = icsk->icsk_ack.pending = 0;
sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
sk_stop_timer(sk, &icsk->icsk_delack_timer);
@@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
htons(ireq->ir_num), sk->sk_uid);
- security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
@@ -640,7 +640,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
htons(ireq->ir_num), sk->sk_uid);
- security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
@@ -787,7 +787,7 @@ static void reqsk_queue_hash_req(struct request_sock *req,
timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
mod_timer(&req->rsk_timer, jiffies + timeout);
- inet_ehash_insert(req_to_sk(req), NULL);
+ inet_ehash_insert(req_to_sk(req), NULL, NULL);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
@@ -851,6 +851,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0;
newicsk->icsk_probes_out = 0;
+ newicsk->icsk_probes_tstamp = 0;
/* Deinitialize accept_queue to trap illegal accesses. */
memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index f1bd95f243b3..93474b1bea4e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -125,6 +125,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
bool net_admin)
{
const struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_sockopt inet_sockopt;
if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
goto errout;
@@ -180,6 +181,22 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = sock_i_ino(sk);
+ memset(&inet_sockopt, 0, sizeof(inet_sockopt));
+ inet_sockopt.recverr = inet->recverr;
+ inet_sockopt.is_icsk = inet->is_icsk;
+ inet_sockopt.freebind = inet->freebind;
+ inet_sockopt.hdrincl = inet->hdrincl;
+ inet_sockopt.mc_loop = inet->mc_loop;
+ inet_sockopt.transparent = inet->transparent;
+ inet_sockopt.mc_all = inet->mc_all;
+ inet_sockopt.nodefrag = inet->nodefrag;
+ inet_sockopt.bind_address_no_port = inet->bind_address_no_port;
+ inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884;
+ inet_sockopt.defer_connect = inet->defer_connect;
+ if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt),
+ &inet_sockopt))
+ goto errout;
+
return 0;
errout:
return 1;
@@ -462,8 +479,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
r->idiag_inode = 0;
if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
- inet_rsk(reqsk)->ir_mark))
+ inet_rsk(reqsk)->ir_mark)) {
+ nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
+ }
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 10d31733297d..05cd198d7a6b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -145,12 +145,16 @@ static void inet_frags_free_cb(void *ptr, void *arg)
inet_frag_destroy(fq);
}
-static void fqdir_work_fn(struct work_struct *work)
+static LLIST_HEAD(fqdir_free_list);
+
+static void fqdir_free_fn(struct work_struct *work)
{
- struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
- struct inet_frags *f = fqdir->f;
+ struct llist_node *kill_list;
+ struct fqdir *fqdir, *tmp;
+ struct inet_frags *f;
- rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+ /* Atomically snapshot the list of fqdirs to free */
+ kill_list = llist_del_all(&fqdir_free_list);
/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
* have completed, since they need to dereference fqdir.
@@ -158,10 +162,25 @@ static void fqdir_work_fn(struct work_struct *work)
*/
rcu_barrier();
- if (refcount_dec_and_test(&f->refcnt))
- complete(&f->completion);
+ llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
+ f = fqdir->f;
+ if (refcount_dec_and_test(&f->refcnt))
+ complete(&f->completion);
- kfree(fqdir);
+ kfree(fqdir);
+ }
+}
+
+static DECLARE_WORK(fqdir_free_work, fqdir_free_fn);
+
+static void fqdir_work_fn(struct work_struct *work)
+{
+ struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
+
+ rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+
+ if (llist_add(&fqdir->free_list, &fqdir_free_list))
+ queue_work(system_wq, &fqdir_free_work);
}
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
@@ -184,10 +203,22 @@ int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
}
EXPORT_SYMBOL(fqdir_init);
+static struct workqueue_struct *inet_frag_wq;
+
+static int __init inet_frag_wq_init(void)
+{
+ inet_frag_wq = create_workqueue("inet_frag_wq");
+ if (!inet_frag_wq)
+ panic("Could not create inet frag workq");
+ return 0;
+}
+
+pure_initcall(inet_frag_wq_init);
+
void fqdir_exit(struct fqdir *fqdir)
{
INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
- queue_work(system_wq, &fqdir->destroy_work);
+ queue_work(inet_frag_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);
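fqdir teardown now frees fqdirs in batches: llist_add() returns true only when it makes the list non-empty, so exactly one worker run is scheduled per batch and the expensive rcu_barrier() is paid once for all queued fqdirs instead of once per fqdir. A generic sketch of that first-adder-schedules pattern (illustrative names, assuming <linux/llist.h> and <linux/workqueue.h>):

	static LLIST_HEAD(pending_sketch);

	static void drain_sketch(struct work_struct *work)
	{
		/* Atomically take the whole batch; later adders start a new one. */
		struct llist_node *batch = llist_del_all(&pending_sketch);

		/* pay the expensive barrier once, then free each entry in batch */
	}

	static DECLARE_WORK(drain_work_sketch, drain_sketch);

	static void defer_free_sketch(struct llist_node *node)
	{
		/* llist_add() returns true only if the list was empty before. */
		if (llist_add(node, &pending_sketch))
			schedule_work(&drain_work_sketch);
	}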
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 239e54474b65..45fb450b4522 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -20,6 +20,9 @@
#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/inet6_hashtables.h>
+#endif
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/tcp.h>
@@ -228,7 +231,7 @@ static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
- const int dif, const int sdif, bool exact_dif)
+ const int dif, const int sdif)
{
int score = -1;
@@ -277,15 +280,13 @@ static struct sock *inet_lhash2_lookup(struct net *net,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
- bool exact_dif = inet_exact_dif_match(net, skb);
struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
int score, hiscore = 0;
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
sk = (struct sock *)icsk;
- score = compute_score(sk, net, hnum, daddr,
- dif, sdif, exact_dif);
+ score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
result = lookup_reuseport(net, sk, skb, doff,
saddr, sport, daddr, hnum);
@@ -510,10 +511,52 @@ static u32 inet_sk_port_offset(const struct sock *sk)
inet->inet_dport);
}
-/* insert a socket into ehash, and eventually remove another one
- * (The another one can be a SYN_RECV or TIMEWAIT
+/* Searches for an existing socket in the ehash bucket list.
+ * Returns true if found, false otherwise.
+ */
+static bool inet_ehash_lookup_by_sk(struct sock *sk,
+ struct hlist_nulls_head *list)
+{
+ const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
+ const int sdif = sk->sk_bound_dev_if;
+ const int dif = sk->sk_bound_dev_if;
+ const struct hlist_nulls_node *node;
+ struct net *net = sock_net(sk);
+ struct sock *esk;
+
+ INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);
+
+ sk_nulls_for_each_rcu(esk, node, list) {
+ if (esk->sk_hash != sk->sk_hash)
+ continue;
+ if (sk->sk_family == AF_INET) {
+ if (unlikely(INET_MATCH(esk, net, acookie,
+ sk->sk_daddr,
+ sk->sk_rcv_saddr,
+ ports, dif, sdif))) {
+ return true;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ if (unlikely(INET6_MATCH(esk, net,
+ &sk->sk_v6_daddr,
+ &sk->sk_v6_rcv_saddr,
+ ports, dif, sdif))) {
+ return true;
+ }
+ }
+#endif
+ }
+ return false;
+}
+
+/* Insert a socket into ehash, and eventually remove another one
+ * (the other one can be a SYN_RECV or TIMEWAIT socket).
+ * If a matching socket already exists, sk is not inserted
+ * and the found_dup_sk parameter is set to true.
*/
-bool inet_ehash_insert(struct sock *sk, struct sock *osk)
+bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
@@ -532,16 +575,23 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
if (osk) {
WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
ret = sk_nulls_del_node_init_rcu(osk);
+ } else if (found_dup_sk) {
+ *found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
+ if (*found_dup_sk)
+ ret = false;
}
+
if (ret)
__sk_nulls_add_node_rcu(sk, list);
+
spin_unlock(lock);
+
return ret;
}
-bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
- bool ok = inet_ehash_insert(sk, osk);
+ bool ok = inet_ehash_insert(sk, osk, found_dup_sk);
if (ok) {
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -585,7 +635,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
int err = 0;
if (sk->sk_state != TCP_LISTEN) {
- inet_ehash_nolisten(sk, osk);
+ inet_ehash_nolisten(sk, osk, NULL);
return 0;
}
WARN_ON(!sk_unhashed(sk));
@@ -681,7 +731,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- inet_ehash_nolisten(sk, NULL);
+ inet_ehash_nolisten(sk, NULL, NULL);
spin_unlock_bh(&head->lock);
return 0;
}
@@ -760,7 +810,7 @@ ok:
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- inet_ehash_nolisten(sk, (struct sock *)tw);
+ inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
}
if (tw)
inet_twsk_bind_unhash(tw, hinfo);
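With the extra parameter, a caller can now distinguish "insert refused because an identical established 4-tuple already won the race" from other outcomes. A hypothetical caller sketch (the error handling and the -EEXIST choice are illustrative, not taken from the patch):

	bool found_dup_sk = false;

	if (!inet_ehash_nolisten(sk, osk, &found_dup_sk)) {
		if (found_dup_sk) {
			/* an identical established socket already exists;
			 * drop this one instead of hashing a duplicate
			 */
			return -EEXIST;
		}
		return -1;
	}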
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c411c87ae865..437afe392e66 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -272,14 +272,14 @@ restart:
continue;
tw = inet_twsk(sk);
if ((tw->tw_family != family) ||
- refcount_read(&twsk_net(tw)->count))
+ refcount_read(&twsk_net(tw)->ns.count))
continue;
if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt)))
continue;
if (unlikely((tw->tw_family != family) ||
- refcount_read(&twsk_net(tw)->count))) {
+ refcount_read(&twsk_net(tw)->ns.count))) {
inet_twsk_put(tw);
goto restart;
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4e31f23e4117..a68bf4c6fe9b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -625,9 +625,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
}
if (dev->header_ops) {
- /* Need space for new headers */
- if (skb_cow_head(skb, dev->needed_headroom -
- (tunnel->hlen + sizeof(struct iphdr))))
+ if (skb_cow_head(skb, 0))
goto free_skb;
tnl_params = (const struct iphdr *)skb->data;
@@ -748,7 +746,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
len = tunnel->tun_hlen - len;
tunnel->hlen = tunnel->hlen + len;
- dev->needed_headroom = dev->needed_headroom + len;
+ if (dev->header_ops)
+ dev->hard_header_len += len;
+ else
+ dev->needed_headroom += len;
+
if (set_mtu)
dev->mtu = max_t(int, dev->mtu - len, 68);
@@ -918,7 +920,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_start_xmit = ipgre_xmit,
.ndo_do_ioctl = ip_tunnel_ioctl,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
.ndo_tunnel_ctl = ipgre_tunnel_ctl,
};
@@ -944,6 +946,7 @@ static void __gre_tunnel_init(struct net_device *dev)
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+ dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
@@ -987,10 +990,14 @@ static int ipgre_tunnel_init(struct net_device *dev)
return -EINVAL;
dev->flags = IFF_BROADCAST;
dev->header_ops = &ipgre_header_ops;
+ dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+ dev->needed_headroom = 0;
}
#endif
} else if (!tunnel->collect_md) {
dev->header_ops = &ipgre_header_ops;
+ dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+ dev->needed_headroom = 0;
}
return ip_tunnel_init(dev);
@@ -1268,7 +1275,7 @@ static const struct net_device_ops gre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
.ndo_fill_metadata_dst = gre_fill_metadata_dst,
};
@@ -1301,7 +1308,7 @@ static const struct net_device_ops erspan_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
.ndo_fill_metadata_dst = gre_fill_metadata_dst,
};
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 948747aac4e2..da1b5038bdfd 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -47,32 +47,32 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
unsigned char *iph = skb_network_header(skb);
memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
- memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+ memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen);
opt = &(IPCB(skb)->opt);
if (opt->srr)
- memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+ memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4);
if (!is_frag) {
if (opt->rr_needaddr)
- ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
+ ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
if (opt->ts_needaddr)
- ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
+ ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
if (opt->ts_needtime) {
__be32 midtime;
midtime = inet_current_timestamp();
- memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
+ memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
}
return;
}
if (opt->rr) {
- memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
+ memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]);
opt->rr = 0;
opt->rr_needaddr = 0;
}
if (opt->ts) {
- memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
+ memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]);
opt->ts = 0;
opt->ts_needaddr = opt->ts_needtime = 0;
}
@@ -495,26 +495,29 @@ EXPORT_SYMBOL(ip_options_compile);
void ip_options_undo(struct ip_options *opt)
{
if (opt->srr) {
- unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
- memmove(optptr+7, optptr+3, optptr[1]-7);
- memcpy(optptr+3, &opt->faddr, 4);
+ unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr);
+
+ memmove(optptr + 7, optptr + 3, optptr[1] - 7);
+ memcpy(optptr + 3, &opt->faddr, 4);
}
if (opt->rr_needaddr) {
- unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr);
+
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
+ memset(&optptr[optptr[2] - 1], 0, 4);
}
if (opt->ts) {
- unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr);
+
if (opt->ts_needtime) {
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
- if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ memset(&optptr[optptr[2] - 1], 0, 4);
+ if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC)
optptr[2] -= 4;
}
if (opt->ts_needaddr) {
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
+ memset(&optptr[optptr[2] - 1], 0, 4);
}
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e6f2ada9e7d5..2ed0b01f72f0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -143,7 +143,8 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
*
*/
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
- __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
+ __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
+ u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
@@ -156,7 +157,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = inet->tos;
+ iph->tos = tos;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
@@ -301,7 +302,7 @@ static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
- if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+ if (skb->len > mtu || IPCB(skb)->frag_max_size)
return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
return ip_finish_output2(net, sk, skb);
@@ -997,7 +998,7 @@ static int __ip_append_data(struct sock *sk,
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
- maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;
if (cork->length + length > maxnonfragsize - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1127,7 +1128,7 @@ alloc_new_skb:
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
- data + transhdrlen, fraggap, 0);
+ data + transhdrlen, fraggap);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
@@ -1352,7 +1353,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
if (cork->flags & IPCORK_OPT)
opt = cork->opt;
- if (!(rt->dst.dev->features&NETIF_F_SG))
+ if (!(rt->dst.dev->features & NETIF_F_SG))
return -EOPNOTSUPP;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1412,7 +1413,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
skb->csum = skb_copy_and_csum_bits(skb_prev,
maxfraglen,
skb_transport_header(skb),
- fraggap, 0);
+ fraggap);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
pskb_trim_unique(skb_prev, maxfraglen);
@@ -1537,7 +1538,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
ip_select_ident(net, skb, sk);
if (opt) {
- iph->ihl += opt->optlen>>2;
+ iph->ihl += opt->optlen >> 2;
ip_options_build(skb, opt, cork->addr, rt, 0);
}
@@ -1649,7 +1650,7 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
{
__wsum csum;
- csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
+ csum = csum_partial_copy_nocheck(dptr+offset, to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
return 0;
}
@@ -1699,7 +1700,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
daddr, saddr,
tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
arg->uid);
- security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
return;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d2c223554ff7..ec6036713e2c 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1124,8 +1124,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
dev_put(dev);
err = -EINVAL;
- if (sk->sk_bound_dev_if &&
- (!midx || midx != sk->sk_bound_dev_if))
+ if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if)
break;
inet->uc_index = ifindex;
@@ -1189,7 +1188,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
err = -EINVAL;
if (sk->sk_bound_dev_if &&
mreq.imr_ifindex != sk->sk_bound_dev_if &&
- (!midx || midx != sk->sk_bound_dev_if))
+ midx != sk->sk_bound_dev_if)
break;
inet->mc_index = mreq.imr_ifindex;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 0c1f36404471..76a420c76f16 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -317,7 +317,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
}
dev->needed_headroom = t_hlen + hlen;
- mtu -= (dev->hard_header_len + t_hlen);
+ mtu -= t_hlen;
if (mtu < IPV4_MIN_MTU)
mtu = IPV4_MIN_MTU;
@@ -347,7 +347,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
nt = netdev_priv(dev);
t_hlen = nt->hlen + sizeof(struct iphdr);
dev->min_mtu = ETH_MIN_MTU;
- dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
+ dev->max_mtu = IP_MAX_MTU - t_hlen;
ip_tunnel_add(itn, nt);
return nt;
@@ -360,7 +360,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
bool log_ecn_error)
{
- struct pcpu_sw_netstats *tstats;
const struct iphdr *iph = ip_hdr(skb);
int err;
@@ -402,12 +401,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
}
}
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
-
+ dev_sw_netstats_rx_add(tunnel->dev, skb->len);
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
if (tunnel->dev->type == ARPHRD_ETHER) {
@@ -494,11 +488,10 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
int mtu;
tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
- pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;
+ pkt_size = skb->len - tunnel_hlen;
if (df)
- mtu = dst_mtu(&rt->dst) - dev->hard_header_len
- - sizeof(struct iphdr) - tunnel_hlen;
+ mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
else
mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
@@ -614,9 +607,6 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ttl = ip4_dst_hoplimit(&rt->dst);
}
- if (!df && skb->protocol == htons(ETH_P_IP))
- df = inner_iph->frag_off & htons(IP_DF);
-
headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
if (headroom > dev->needed_headroom)
dev->needed_headroom = headroom;
@@ -768,8 +758,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
- if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
- 0, 0, false)) {
+ df = tnl_params->frag_off;
+ if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
+ df |= (inner_iph->frag_off & htons(IP_DF));
+
+ if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
ip_rt_put(rt);
goto tx_error;
}
@@ -797,10 +790,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ttl = ip4_dst_hoplimit(&rt->dst);
}
- df = tnl_params->frag_off;
- if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
- df |= (inner_iph->frag_off&htons(IP_DF));
-
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
if (max_headroom > dev->needed_headroom)
@@ -982,7 +971,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
- int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
+ int max_mtu = IP_MAX_MTU - t_hlen;
if (new_mtu < ETH_MIN_MTU)
return -EINVAL;
@@ -1159,10 +1148,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
mtu = ip_tunnel_bind_dev(dev);
if (tb[IFLA_MTU]) {
- unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
+ unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
- mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
- (unsigned int)(max - sizeof(struct iphdr)));
+ mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
}
err = dev_set_mtu(dev, mtu);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index b2ea1a8c5fd6..7ca338fbe8ba 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -263,7 +263,7 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
const struct icmphdr *icmph = icmp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
- if (mtu <= 576 || iph->frag_off != htons(IP_DF))
+ if (mtu < 576 || iph->frag_off != htons(IP_DF))
return 0;
if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) ||
@@ -359,7 +359,7 @@ static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
__be16 frag_off;
int offset;
- if (mtu <= IPV6_MIN_MTU)
+ if (mtu < IPV6_MIN_MTU)
return 0;
if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
@@ -429,36 +429,6 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
}
EXPORT_SYMBOL(skb_tunnel_check_pmtu);
-/* Often modified stats are per cpu, other are shared (netdev->stats) */
-void ip_tunnel_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
-{
- int i;
-
- netdev_stats_to_stats64(tot, &dev->stats);
-
- for_each_possible_cpu(i) {
- const struct pcpu_sw_netstats *tstats =
- per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_irq(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
-
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
- }
-}
-EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
-
static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
[LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS },
[LWTUNNEL_IP_ID] = { .type = NLA_U64 },
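dev_sw_netstats_rx_add() and dev_get_tstats64() are generic replacements for the per-driver accounting removed here: the RX helper performs essentially the per-CPU update the tunnels used to open-code, and the stats64 helper does the same aggregation ip_tunnel_get_stats64() did. A sketch of the receive-side update, mirroring the removed block rather than introducing a new API:

	struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);

	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += len;
	u64_stats_update_end(&tstats->syncp);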
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index f687abb069fa..abc171e79d3e 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -95,7 +95,6 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
{
unsigned short family;
struct net_device *dev;
- struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
const struct xfrm_mode *inner_mode;
struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
@@ -138,13 +137,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
skb->dev = dev;
-
- tstats = this_cpu_ptr(dev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(dev, skb->len);
return 0;
}
@@ -411,7 +404,7 @@ static const struct net_device_ops vti_netdev_ops = {
.ndo_start_xmit = vti_tunnel_xmit,
.ndo_do_ioctl = ip_tunnel_ioctl,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
.ndo_tunnel_ctl = vti_tunnel_ctl,
};
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 561f15b5a944..3cd13e1bc6a7 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1441,7 +1441,7 @@ static int __init ip_auto_config(void)
int retries = CONF_OPEN_RETRIES;
#endif
int err;
- unsigned int i;
+ unsigned int i, count;
/* Initialise all name servers and NTP servers to NONE (but only if the
* "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
@@ -1575,7 +1575,7 @@ static int __init ip_auto_config(void)
if (ic_dev_mtu)
pr_cont(", mtu=%d", ic_dev_mtu);
/* Name servers (if any): */
- for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
+ for (i = 0, count = 0; i < CONF_NAMESERVERS_MAX; i++) {
if (ic_nameservers[i] != NONE) {
if (i == 0)
pr_info(" nameserver%u=%pI4",
@@ -1583,12 +1583,14 @@ static int __init ip_auto_config(void)
else
pr_cont(", nameserver%u=%pI4",
i, &ic_nameservers[i]);
+
+ count++;
}
- if (i + 1 == CONF_NAMESERVERS_MAX)
+ if ((i + 1 == CONF_NAMESERVERS_MAX) && count > 0)
pr_cont("\n");
}
/* NTP servers (if any): */
- for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+ for (i = 0, count = 0; i < CONF_NTP_SERVERS_MAX; i++) {
if (ic_ntp_servers[i] != NONE) {
if (i == 0)
pr_info(" ntpserver%u=%pI4",
@@ -1596,8 +1598,10 @@ static int __init ip_auto_config(void)
else
pr_cont(", ntpserver%u=%pI4",
i, &ic_ntp_servers[i]);
+
+ count++;
}
- if (i + 1 == CONF_NTP_SERVERS_MAX)
+ if ((i + 1 == CONF_NTP_SERVERS_MAX) && count > 0)
pr_cont("\n");
}
#endif /* !SILENT */
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 75d35e76bec2..d5bfa087c23a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -347,7 +347,7 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ip_tunnel_ioctl,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
.ndo_tunnel_ctl = ipip_tunnel_ctl,
};
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 876fd6ff1ff9..939792a38814 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1038,10 +1038,13 @@ static int ipmr_cache_report(struct mr_table *mrt,
memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
msg->im_msgtype = assert;
msg->im_mbz = 0;
- if (assert == IGMPMSG_WRVIFWHOLE)
+ if (assert == IGMPMSG_WRVIFWHOLE) {
msg->im_vif = vifi;
- else
+ msg->im_vif_hi = vifi >> 8;
+ } else {
msg->im_vif = mrt->mroute_reg_vif_num;
+ msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8;
+ }
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
@@ -1054,6 +1057,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
ip_hdr(skb)->protocol = 0;
msg = (struct igmpmsg *)skb_network_header(skb);
msg->im_vif = vifi;
+ msg->im_vif_hi = vifi >> 8;
skb_dst_set(skb, dst_clone(skb_dst(pkt)));
/* Add our header */
igmp = skb_put(skb, sizeof(struct igmphdr));
@@ -2396,6 +2400,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
+ nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */
+ nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */
+ nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */
+ + nla_total_size(4) /* IPMRA_CREPORT_TABLE */
/* IPMRA_CREPORT_PKT */
+ nla_total_size(payloadlen)
;
@@ -2427,11 +2432,12 @@ static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
rtgenm = nlmsg_data(nlh);
rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
- nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) ||
+ nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) ||
nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
msg->im_src.s_addr) ||
nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
- msg->im_dst.s_addr))
+ msg->im_dst.s_addr) ||
+ nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id))
goto nla_put_failure;
nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 3205d5f7c8c9..25ea6ac44db9 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -31,7 +31,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
- nla_strlcpy(tmp, nla, sizeof(tmp));
+ nla_strscpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC) {
NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index a058213b77a7..7c841037c533 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -17,17 +17,19 @@
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type)
+int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi4 fl4 = {};
__be32 saddr = iph->saddr;
- const struct sock *sk = skb_to_full_sk(skb);
- __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0;
+ __u8 flags;
struct net_device *dev = skb_dst(skb)->dev;
unsigned int hh_len;
+ sk = sk_to_full_sk(sk);
+ flags = sk ? inet_sk_flowi_flags(sk) : 0;
+
if (addr_type == RTN_UNSPEC)
addr_type = inet_addr_type_dev_table(net, dev, saddr);
if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index d1e04d2b5170..c576a63d09db 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -203,7 +203,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
- private = READ_ONCE(table->private); /* Address dependency. */
+ private = rcu_access_pointer(table->private);
cpu = smp_processor_id();
table_base = private->entries;
jumpstack = (struct arpt_entry **)private->jumpstack[cpu];
@@ -649,7 +649,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(table);
/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
@@ -673,7 +673,7 @@ static int copy_entries_to_user(unsigned int total_size,
unsigned int off, num;
const struct arpt_entry *e;
struct xt_counters *counters;
- struct xt_table_info *private = table->private;
+ struct xt_table_info *private = xt_table_get_private_protected(table);
int ret = 0;
void *loc_cpu_entry;
@@ -807,7 +807,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
if (!IS_ERR(t)) {
struct arpt_getinfo info;
- const struct xt_table_info *private = t->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(t);
#ifdef CONFIG_COMPAT
struct xt_table_info tmp;
@@ -860,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
if (!IS_ERR(t)) {
- const struct xt_table_info *private = t->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(t);
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
@@ -1017,7 +1017,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
}
local_bh_disable();
- private = t->private;
+ private = xt_table_get_private_protected(t);
if (private->number != tmp.num_counters) {
ret = -EINVAL;
goto unlock_up_free;
@@ -1330,7 +1330,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
void __user *userptr)
{
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(table);
void __user *pos;
unsigned int size;
int ret = 0;
@@ -1379,7 +1379,7 @@ static int compat_get_entries(struct net *net,
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
if (!IS_ERR(t)) {
- const struct xt_table_info *private = t->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(t);
struct xt_table_info info;
ret = compat_table_info(private, &info);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index f15bc21d7301..e8f6f9d86237 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -258,7 +258,7 @@ ipt_do_table(struct sk_buff *skb,
WARN_ON(!(table->valid_hooks & (1 << hook)));
local_bh_disable();
addend = xt_write_recseq_begin();
- private = READ_ONCE(table->private); /* Address dependency. */
+ private = rcu_access_pointer(table->private);
cpu = smp_processor_id();
table_base = private->entries;
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
@@ -791,7 +791,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(table);
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -815,7 +815,7 @@ copy_entries_to_user(unsigned int total_size,
unsigned int off, num;
const struct ipt_entry *e;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(table);
int ret = 0;
const void *loc_cpu_entry;
@@ -964,7 +964,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
t = xt_request_find_table_lock(net, AF_INET, name);
if (!IS_ERR(t)) {
struct ipt_getinfo info;
- const struct xt_table_info *private = t->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(t);
#ifdef CONFIG_COMPAT
struct xt_table_info tmp;
@@ -1018,7 +1018,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
t = xt_find_table_lock(net, AF_INET, get.name);
if (!IS_ERR(t)) {
- const struct xt_table_info *private = t->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(t);
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
@@ -1173,7 +1173,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
}
local_bh_disable();
- private = t->private;
+ private = xt_table_get_private_protected(t);
if (private->number != tmp.num_counters) {
ret = -EINVAL;
goto unlock_up_free;
@@ -1543,7 +1543,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
void __user *userptr)
{
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(table);
void __user *pos;
unsigned int size;
int ret = 0;
@@ -1589,7 +1589,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
if (!IS_ERR(t)) {
- const struct xt_table_info *private = t->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(t);
struct xt_table_info info;
ret = compat_table_info(private, &info);
if (!ret && get.size == info.size)
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index e16b98ee6266..4b8840734762 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -56,7 +56,8 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(xt_net(par), skb, hook);
+ nf_send_reset(xt_net(par), par->state->sk, skb, hook);
+ break;
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index cc23f1ce239c..8cd3224d913e 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
- flow.flowi4_tos = RT_TOS(iph->tos);
+ flow.flowi4_tos = iph->tos & IPTOS_RT_MASK;
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par));
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index f703a717ab1d..833079589273 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -62,7 +62,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index 7a83f881efa9..136030ad2e54 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -43,16 +43,31 @@ static void dump_arp_packet(struct nf_log_buf *m,
const struct nf_loginfo *info,
const struct sk_buff *skb, unsigned int nhoff)
{
- const struct arphdr *ah;
- struct arphdr _arph;
const struct arppayload *ap;
struct arppayload _arpp;
+ const struct arphdr *ah;
+ unsigned int logflags;
+ struct arphdr _arph;
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
if (ah == NULL) {
nf_log_buf_add(m, "TRUNCATED");
return;
}
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_DEFAULT_MASK;
+
+ if (logflags & NF_LOG_MACDECODE) {
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+ nf_log_dump_vlan(m, skb);
+ nf_log_buf_add(m, "MACPROTO=%04x ",
+ ntohs(eth_hdr(skb)->h_proto));
+ }
+
nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 0c72156130b6..d07583fac8f8 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -284,8 +284,10 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m,
switch (dev->type) {
case ARPHRD_ETHER:
- nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
- eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+ nf_log_dump_vlan(m, skb);
+ nf_log_buf_add(m, "MACPROTO=%04x ",
ntohs(eth_hdr(skb)->h_proto));
return;
default:
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 9dcfa4e461b6..4eed5afca392 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -12,6 +12,128 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
+static int nf_reject_iphdr_validate(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ u32 len;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 || iph->version != 4)
+ return 0;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ return 0;
+ else if (len < (iph->ihl*4))
+ return 0;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ return 0;
+
+ return 1;
+}
+
+struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ const struct tcphdr *oth;
+ struct sk_buff *nskb;
+ struct iphdr *niph;
+ struct tcphdr _oth;
+
+ if (!nf_reject_iphdr_validate(oldskb))
+ return NULL;
+
+ oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+ if (!oth)
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+ net->ipv4.sysctl_ip_default_ttl);
+ nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset);
+
+struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+ struct iphdr *niph;
+ struct icmphdr *icmph;
+ unsigned int len;
+ __wsum csum;
+ u8 proto;
+
+ if (!nf_reject_iphdr_validate(oldskb))
+ return NULL;
+
+ /* IP header checks: fragment. */
+ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+ return NULL;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+ len = min_t(unsigned int, 536, oldskb->len);
+
+ if (!pskb_may_pull(oldskb, len))
+ return NULL;
+
+ if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
+ return NULL;
+
+ proto = ip_hdr(oldskb)->protocol;
+
+ if (!skb_csum_unnecessary(oldskb) &&
+ nf_reject_verify_csum(proto) &&
+ nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+ LL_MAX_HEADER + len, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
+ net->ipv4.sysctl_ip_default_ttl);
+
+ skb_reset_transport_header(nskb);
+ icmph = skb_put_zero(nskb, sizeof(struct icmphdr));
+ icmph->type = ICMP_DEST_UNREACH;
+ icmph->code = code;
+
+ skb_put_data(nskb, skb_network_header(oldskb), len);
+
+ csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0);
+ icmph->checksum = csum_fold(csum);
+
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach);
+
const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
struct tcphdr *_oth, int hook)
{
@@ -112,7 +234,8 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
}
/* Send RST reply */
-void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
+void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+ int hook)
{
struct net_device *br_indev __maybe_unused;
struct sk_buff *nskb;
@@ -124,7 +247,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
if (!oth)
return;
- if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(oldskb))
+ if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) &&
+ nf_reject_fill_skb_dst(oldskb) < 0)
return;
if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -144,8 +268,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
ip4_dst_hoplimit(skb_dst(nskb)));
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
-
- if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC))
goto free_nskb;
niph = ip_hdr(nskb);
@@ -193,7 +316,8 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
if (iph->frag_off & htons(IP_OFFSET))
return;
- if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(skb_in))
+ if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) &&
+ nf_reject_fill_skb_dst(skb_in) < 0)
return;
if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) {
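
The two helpers added above (nf_reject_skb_v4_tcp_reset() and nf_reject_skb_v4_unreach()) only build the reject packet; transmission is left to the caller. A minimal sketch of how an ingress-hook user might consume nf_reject_skb_v4_unreach() follows. Only the two exported calls come from this diff; the declaration location, the link-layer push, and the transmit are assumptions based on the LL_MAX_HEADER headroom the helper reserves.

#include <linux/etherdevice.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <net/netfilter/ipv4/nf_reject.h>

/* Illustrative sketch, not part of the diff. */
static void example_send_unreach_at_ingress(struct net *net,
					    struct sk_buff *oldskb,
					    const struct net_device *dev)
{
	struct sk_buff *nskb;

	/* Validates the original IPv4 header and builds a fresh ICMP
	 * destination-unreachable reply; returns NULL if validation fails.
	 */
	nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, NF_INET_INGRESS,
					ICMP_PORT_UNREACH);
	if (!nskb)
		return;

	/* The caller owns the link layer and the transmit (assumed here to
	 * be plain Ethernet back towards the original sender).
	 */
	if (dev_hard_header(nskb, nskb->dev, ETH_P_IP,
			    eth_hdr(oldskb)->h_source, NULL, nskb->len) < 0) {
		kfree_skb(nskb);
		return;
	}
	dev_queue_xmit(nskb);
}
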
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index e408f813f5d8..ff437e4ed6db 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -27,7 +27,8 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset(nft_net(pkt), pkt->xt.state->sk, pkt->skb,
+ nft_hook(pkt));
break;
default:
break;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 134e92382275..e53e43aef785 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -36,14 +36,145 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
[NHA_FDB] = { .type = NLA_FLAG },
};
+static bool nexthop_notifiers_is_empty(struct net *net)
+{
+ return !net->nexthop.notifier_chain.head;
+}
+
+static void
+__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
+ const struct nexthop *nh)
+{
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+ nh_info->dev = nhi->fib_nhc.nhc_dev;
+ nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
+ if (nh_info->gw_family == AF_INET)
+ nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
+ else if (nh_info->gw_family == AF_INET6)
+ nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;
+
+ nh_info->is_reject = nhi->reject_nh;
+ nh_info->is_fdb = nhi->fdb_nh;
+ nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
+}
+
+static int nh_notifier_single_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
+ if (!info->nh)
+ return -ENOMEM;
+
+ __nh_notifier_single_info_init(info->nh, nh);
+
+ return 0;
+}
+
+static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
+{
+ kfree(info->nh);
+}
+
+static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ u16 num_nh = nhg->num_nh;
+ int i;
+
+ info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
+ GFP_KERNEL);
+ if (!info->nh_grp)
+ return -ENOMEM;
+
+ info->nh_grp->num_nh = num_nh;
+ info->nh_grp->is_fdb = nhg->fdb_nh;
+
+ for (i = 0; i < num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ info->nh_grp->nh_entries[i].id = nhge->nh->id;
+ info->nh_grp->nh_entries[i].weight = nhge->weight;
+ __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
+ nhge->nh);
+ }
+
+ return 0;
+}
+
+static void nh_notifier_grp_info_fini(struct nh_notifier_info *info)
+{
+ kfree(info->nh_grp);
+}
+
+static int nh_notifier_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ info->id = nh->id;
+ info->is_grp = nh->is_group;
+
+ if (info->is_grp)
+ return nh_notifier_grp_info_init(info, nh);
+ else
+ return nh_notifier_single_info_init(info, nh);
+}
+
+static void nh_notifier_info_fini(struct nh_notifier_info *info)
+{
+ if (info->is_grp)
+ nh_notifier_grp_info_fini(info);
+ else
+ nh_notifier_single_info_fini(info);
+}
+
static int call_nexthop_notifiers(struct net *net,
enum nexthop_event_type event_type,
- struct nexthop *nh)
+ struct nexthop *nh,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_notifier_info info = {
+ .net = net,
+ .extack = extack,
+ };
+ int err;
+
+ ASSERT_RTNL();
+
+ if (nexthop_notifiers_is_empty(net))
+ return 0;
+
+ err = nh_notifier_info_init(&info, nh);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
+ return err;
+ }
+
+ err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+ event_type, &info);
+ nh_notifier_info_fini(&info);
+
+ return notifier_to_errno(err);
+}
+
+static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
+ enum nexthop_event_type event_type,
+ struct nexthop *nh,
+ struct netlink_ext_ack *extack)
{
+ struct nh_notifier_info info = {
+ .net = net,
+ .extack = extack,
+ };
int err;
- err = atomic_notifier_call_chain(&net->nexthop.notifier_chain,
- event_type, nh);
+ err = nh_notifier_info_init(&info, nh);
+ if (err)
+ return err;
+
+ err = nb->notifier_call(nb, event_type, &info);
+ nh_notifier_info_fini(&info);
+
return notifier_to_errno(err);
}
@@ -133,12 +264,9 @@ static struct nexthop *nexthop_alloc(void)
static struct nh_group *nexthop_grp_alloc(u16 num_nh)
{
- size_t sz = offsetof(struct nexthop, nh_grp)
- + sizeof(struct nh_group)
- + sizeof(struct nh_grp_entry) * num_nh;
struct nh_group *nhg;
- nhg = kzalloc(sz, GFP_KERNEL);
+ nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
if (nhg)
nhg->num_nh = num_nh;
@@ -279,7 +407,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
case AF_INET:
fib_nh = &nhi->fib_nh;
if (fib_nh->fib_nh_gw_family &&
- nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
+ nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
goto nla_put_failure;
break;
@@ -499,7 +627,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) {
if (!tb[i])
continue;
- if (tb[NHA_FDB])
+ if (i == NHA_FDB)
continue;
NL_SET_ERR_MSG(extack,
"No other attributes can be set in nexthop groups");
@@ -785,9 +913,10 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
{
struct nh_grp_entry *nhges, *new_nhges;
struct nexthop *nhp = nhge->nh_parent;
+ struct netlink_ext_ack extack;
struct nexthop *nh = nhge->nh;
struct nh_group *nhg, *newg;
- int i, j;
+ int i, j, err;
WARN_ON(!nh);
@@ -800,7 +929,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
return;
}
- newg->has_v4 = nhg->has_v4;
+ newg->has_v4 = false;
newg->mpath = nhg->mpath;
newg->fdb_nh = nhg->fdb_nh;
newg->num_nh = nhg->num_nh;
@@ -809,12 +938,18 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
nhges = nhg->nh_entries;
new_nhges = newg->nh_entries;
for (i = 0, j = 0; i < nhg->num_nh; ++i) {
+ struct nh_info *nhi;
+
/* current nexthop getting removed */
if (nhg->nh_entries[i].nh == nh) {
newg->num_nh--;
continue;
}
+ nhi = rtnl_dereference(nhges[i].nh->nh_info);
+ if (nhi->family == AF_INET)
+ newg->has_v4 = true;
+
list_del(&nhges[i].nh_list);
new_nhges[j].nh_parent = nhges[i].nh_parent;
new_nhges[j].nh = nhges[i].nh;
@@ -829,6 +964,10 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
list_del(&nhge->nh_list);
nexthop_put(nhge->nh);
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack);
+ if (err)
+ pr_err("%s\n", extack._msg);
+
if (nlinfo)
nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
}
@@ -842,7 +981,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
remove_nh_grp_entry(net, nhge, nlinfo);
/* make sure all see the newly published array before releasing rtnl */
- synchronize_rcu();
+ synchronize_net();
}
static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
@@ -867,8 +1006,6 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
bool do_flush = false;
struct fib_info *fi;
- call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh);
-
list_for_each_entry(fi, &nh->fi_list, nh_list) {
fi->fib_flags |= RTNH_F_DEAD;
do_flush = true;
@@ -906,6 +1043,8 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh,
static void remove_nexthop(struct net *net, struct nexthop *nh,
struct nl_info *nlinfo)
{
+ call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);
+
/* remove from the tree */
rb_erase(&nh->rb_node, &net->nexthop.rb_root);
@@ -937,13 +1076,17 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old,
struct netlink_ext_ack *extack)
{
struct nh_group *oldg, *newg;
- int i;
+ int i, err;
if (!new->is_group) {
NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
return -EINVAL;
}
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
+ if (err)
+ return err;
+
oldg = rtnl_dereference(old->nh_grp);
newg = rtnl_dereference(new->nh_grp);
@@ -961,30 +1104,100 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old,
return 0;
}
+static void nh_group_v4_update(struct nh_group *nhg)
+{
+ struct nh_grp_entry *nhges;
+ bool has_v4 = false;
+ int i;
+
+ nhges = nhg->nh_entries;
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nhges[i].nh->nh_info);
+ if (nhi->family == AF_INET)
+ has_v4 = true;
+ }
+ nhg->has_v4 = has_v4;
+}
+
static int replace_nexthop_single(struct net *net, struct nexthop *old,
struct nexthop *new,
struct netlink_ext_ack *extack)
{
+ u8 old_protocol, old_nh_flags;
struct nh_info *oldi, *newi;
+ struct nh_grp_entry *nhge;
+ int err;
if (new->is_group) {
NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
return -EINVAL;
}
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
+ if (err)
+ return err;
+
+ /* Hardware flags were set on 'old' as 'new' is not in the red-black
+ * tree. Therefore, inherit the flags from 'old' to 'new'.
+ */
+ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);
+
oldi = rtnl_dereference(old->nh_info);
newi = rtnl_dereference(new->nh_info);
newi->nh_parent = old;
oldi->nh_parent = new;
+ old_protocol = old->protocol;
+ old_nh_flags = old->nh_flags;
+
old->protocol = new->protocol;
old->nh_flags = new->nh_flags;
rcu_assign_pointer(old->nh_info, newi);
rcu_assign_pointer(new->nh_info, oldi);
+ /* Send a replace notification for all the groups using the nexthop. */
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ struct nexthop *nhp = nhge->nh_parent;
+
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
+ extack);
+ if (err)
+ goto err_notify;
+ }
+
+ /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
+ * update IPv4 indication in all the groups using the nexthop.
+ */
+ if (oldi->family == AF_INET && newi->family == AF_INET6) {
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ struct nexthop *nhp = nhge->nh_parent;
+ struct nh_group *nhg;
+
+ nhg = rtnl_dereference(nhp->nh_grp);
+ nh_group_v4_update(nhg);
+ }
+ }
+
return 0;
+
+err_notify:
+ rcu_assign_pointer(new->nh_info, newi);
+ rcu_assign_pointer(old->nh_info, oldi);
+ old->nh_flags = old_nh_flags;
+ old->protocol = old_protocol;
+ oldi->nh_parent = old;
+ newi->nh_parent = new;
+ list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
+ struct nexthop *nhp = nhge->nh_parent;
+
+ call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, extack);
+ }
+ call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
+ return err;
}
static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
@@ -1101,7 +1314,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh,
while (1) {
struct nexthop *nh;
- next = rtnl_dereference(*pp);
+ next = *pp;
if (!next)
break;
@@ -1133,7 +1346,11 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh,
rb_link_node_rcu(&new_nh->rb_node, parent, pp);
rb_insert_color(&new_nh->rb_node, root);
- rc = 0;
+
+ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
+ if (rc)
+ rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);
+
out:
if (!rc) {
nh_base_seq_inc(net);
@@ -1242,8 +1459,10 @@ static struct nexthop *nexthop_create_group(struct net *net,
return nh;
out_no_nh:
- for (; i >= 0; --i)
+ for (i--; i >= 0; --i) {
+ list_del(&nhg->nh_entries[i].nh_list);
nexthop_put(nhg->nh_entries[i].nh);
+ }
kfree(nhg->spare);
kfree(nhg);
@@ -1922,19 +2141,71 @@ static struct notifier_block nh_netdev_notifier = {
.notifier_call = nh_netdev_event,
};
-int register_nexthop_notifier(struct net *net, struct notifier_block *nb)
+static int nexthops_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct rb_root *root = &net->nexthop.rb_root;
+ struct rb_node *node;
+ int err = 0;
+
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ struct nexthop *nh;
+
+ nh = rb_entry(node, struct nexthop, rb_node);
+ err = call_nexthop_notifier(nb, net, NEXTHOP_EVENT_REPLACE, nh,
+ extack);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb);
+ int err;
+
+ rtnl_lock();
+ err = nexthops_dump(net, nb, extack);
+ if (err)
+ goto unlock;
+ err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
+ nb);
+unlock:
+ rtnl_unlock();
+ return err;
}
EXPORT_SYMBOL(register_nexthop_notifier);
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain,
- nb);
+ return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
+ nb);
}
EXPORT_SYMBOL(unregister_nexthop_notifier);
+void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
+{
+ struct nexthop *nexthop;
+
+ rcu_read_lock();
+
+ nexthop = nexthop_find_by_id(net, id);
+ if (!nexthop)
+ goto out;
+
+ nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+ if (offload)
+ nexthop->nh_flags |= RTNH_F_OFFLOAD;
+ if (trap)
+ nexthop->nh_flags |= RTNH_F_TRAP;
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_set_hw_flags);
+
static void __net_exit nexthop_net_exit(struct net *net)
{
rtnl_lock();
@@ -1951,7 +2222,7 @@ static int __net_init nexthop_net_init(struct net *net)
net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
if (!net->nexthop.devhash)
return -ENOMEM;
- ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
+ BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
return 0;
}
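
With the switch to a blocking notifier chain, listeners now receive a self-contained struct nh_notifier_info (single-nexthop or group view) rather than a raw struct nexthop, registration replays all existing nexthops as REPLACE events under RTNL, and offload state is reported back through nexthop_set_hw_flags(). A minimal listener sketch, assuming driver-side bookkeeping that is not part of this diff:

#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/notifier.h>
#include <linux/printk.h>
#include <net/nexthop.h>

static int example_nexthop_event(struct notifier_block *nb,
				 unsigned long event, void *ptr)
{
	struct nh_notifier_info *info = ptr;

	switch (event) {
	case NEXTHOP_EVENT_REPLACE:
		if (info->is_grp)
			pr_debug("nexthop group %u with %u entries\n",
				 info->id, info->nh_grp->num_nh);
		else
			pr_debug("nexthop %u via %s\n", info->id,
				 info->nh->dev ? info->nh->dev->name : "none");
		/* For brevity, report successful offload right away; a real
		 * driver would do this once hardware programming completes.
		 */
		nexthop_set_hw_flags(info->net, info->id, true, false);
		break;
	case NEXTHOP_EVENT_DEL:
		pr_debug("nexthop %u removed\n", info->id);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_nexthop_nb = {
	.notifier_call = example_nexthop_event,
};

/* register_nexthop_notifier() now takes an extack and, while holding RTNL,
 * replays the current nexthop table as REPLACE events before registering.
 */
static int example_register(struct net *net, struct netlink_ext_ack *extack)
{
	return register_nexthop_notifier(net, &example_nexthop_nb, extack);
}
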
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index df6fbefe44d4..8b943f85fff9 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -293,7 +293,8 @@ EXPORT_SYMBOL_GPL(ping_close);
/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
- struct sockaddr *uaddr, int addr_len) {
+ struct sockaddr *uaddr, int addr_len)
+{
struct net *net = sock_net(sk);
if (sk->sk_family == AF_INET) {
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
@@ -310,10 +311,10 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
- chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
-
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
chk_addr_ret = RTN_LOCAL;
+ else
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
if ((!inet_can_nonlocal_bind(net, isk) &&
chk_addr_ret != RTN_LOCAL) ||
@@ -383,20 +384,6 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
}
}
-static void ping_clear_saddr(struct sock *sk, int dif)
-{
- sk->sk_bound_dev_if = dif;
- if (sk->sk_family == AF_INET) {
- struct inet_sock *isk = inet_sk(sk);
- isk->inet_rcv_saddr = isk->inet_saddr = 0;
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
- memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
- memset(&np->saddr, 0, sizeof(np->saddr));
-#endif
- }
-}
/*
* We need our own bind because there are no privileged id's == local ports.
* Moreover, we don't allow binding to multi- and broadcast addresses.
@@ -420,12 +407,13 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto out;
err = -EADDRINUSE;
- ping_set_saddr(sk, uaddr);
snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port);
if (ping_get_port(sk, snum) != 0) {
- ping_clear_saddr(sk, dif);
+		/* Restore sk->sk_bound_dev_if possibly modified by ping_check_bind_addr(). */
+ sk->sk_bound_dev_if = dif;
goto out;
}
+ ping_set_saddr(sk, uaddr);
pr_debug("after bind(): num = %hu, dif = %d\n",
isk->inet_num,
@@ -647,7 +635,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
}
int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
- void *user_icmph, size_t icmph_len) {
+ void *user_icmph, size_t icmph_len)
+{
u8 type, code;
if (len > 0xFFFF)
@@ -789,7 +778,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4.fl4_icmp_type = user_icmph.type;
fl4.fl4_icmp_code = user_icmph.code;
- security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8d5e1695b9aa..63cd370ea29d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -167,6 +167,7 @@ static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
+ SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 407956be7deb..50a73178d63a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -260,11 +260,12 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
err = EHOSTUNREACH;
if (code > NR_ICMP_UNREACH)
break;
- err = icmp_err_convert[code].errno;
- harderr = icmp_err_convert[code].fatal;
if (code == ICMP_FRAG_NEEDED) {
harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
err = EMSGSIZE;
+ } else {
+ err = icmp_err_convert[code].errno;
+ harderr = icmp_err_convert[code].fatal;
}
}
@@ -478,7 +479,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
skb->csum = csum_block_add(
skb->csum,
csum_partial_copy_nocheck(rfv->hdr.c + offset,
- to, copy, 0),
+ to, copy),
odd);
odd = 0;
@@ -639,7 +640,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto done;
}
- security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 58642b29a499..e26652ff7059 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -623,7 +623,7 @@ static inline u32 fnhe_hashfun(__be32 daddr)
u32 hval;
net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
- hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
+ hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
return hash_32(hval, FNHE_HASH_SHIFT);
}
@@ -1016,13 +1016,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
struct dst_entry *dst = &rt->dst;
struct net *net = dev_net(dst->dev);
- u32 old_mtu = ipv4_mtu(dst);
struct fib_result res;
bool lock = false;
+ u32 old_mtu;
if (ip_mtu_locked(dst))
return;
+ old_mtu = ipv4_mtu(dst);
if (old_mtu < mtu)
return;
@@ -1066,7 +1067,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
int oif, u8 protocol)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
u32 mark = IP4_REPLY_MARK(net, skb->mark);
@@ -1083,7 +1084,7 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
@@ -1101,7 +1102,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
struct dst_entry *odst = NULL;
@@ -1131,7 +1132,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
new = true;
}
- __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
+ __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
if (!dst_check(&rt->dst, 0)) {
if (new)
@@ -1156,7 +1157,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
int oif, u8 protocol)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
@@ -1172,7 +1173,7 @@ EXPORT_SYMBOL_GPL(ipv4_redirect);
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
struct net *net = sock_net(sk);
@@ -1312,7 +1313,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
- const struct rtable *rt = (const struct rtable *) dst;
+ const struct rtable *rt = (const struct rtable *)dst;
unsigned int mtu = rt->rt_pmtu;
if (!mtu || time_after_eq(jiffies, rt->dst.expires))
@@ -1740,7 +1741,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
flags |= RTCF_LOCAL;
rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+ IN_DEV_ORCONF(in_dev, NOPOLICY), false);
if (!rth)
return -ENOBUFS;
@@ -1856,8 +1857,8 @@ static int __mkroute_input(struct sk_buff *skb,
}
rth = rt_dst_alloc(out_dev->dev, 0, res->type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(out_dev, NOXFRM));
+ IN_DEV_ORCONF(in_dev, NOPOLICY),
+ IN_DEV_ORCONF(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
goto cleanup;
@@ -2226,7 +2227,7 @@ local_input:
rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
flags | RTCF_LOCAL, res->type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+ IN_DEV_ORCONF(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
@@ -2449,8 +2450,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
add:
rth = rt_dst_alloc(dev_out, flags, type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(in_dev, NOXFRM));
+ IN_DEV_ORCONF(in_dev, NOPOLICY),
+ IN_DEV_ORCONF(in_dev, NOXFRM));
if (!rth)
return ERR_PTR(-ENOBUFS);
@@ -2769,10 +2770,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
if (IS_ERR(rt))
return rt;
- if (flp4->flowi4_proto)
+ if (flp4->flowi4_proto) {
+ flp4->flowi4_oif = rt->dst.dev->ifindex;
rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
flowi4_to_flowi(flp4),
sk, 0);
+ }
return rt;
}
@@ -2869,6 +2872,9 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (rt->dst.dev &&
nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
goto nla_put_failure;
+ if (rt->dst.lwtstate &&
+ lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (rt->dst.tclassid &&
nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
@@ -3219,7 +3225,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = rtm->rtm_tos;
+ fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
@@ -3243,8 +3249,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_iif = iif; /* for rt_fill_info */
skb->dev = dev;
skb->mark = mark;
- err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
- dev, &res);
+ err = ip_route_input_rcu(skb, dst, src,
+ rtm->rtm_tos & IPTOS_RT_MASK, dev,
+ &res);
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e03756631541..33792cf55a79 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -286,11 +286,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
struct sock *sk,
struct sk_buff *skb)
{
+ struct tcp_request_sock *treq;
struct request_sock *req;
#ifdef CONFIG_MPTCP
- struct tcp_request_sock *treq;
-
if (sk_is_mptcp(sk))
ops = &mptcp_subflow_request_sock_ops;
#endif
@@ -299,8 +298,9 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
if (!req)
return NULL;
-#if IS_ENABLED(CONFIG_MPTCP)
treq = tcp_rsk(req);
+ treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+#if IS_ENABLED(CONFIG_MPTCP)
treq->is_mptcp = sk_is_mptcp(sk);
if (treq->is_mptcp) {
int err = mptcp_subflow_init_cookie_req(req, sk, skb);
@@ -331,7 +331,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
__u32 cookie = ntohl(th->ack_seq) - 1;
struct sock *ret = sk;
struct request_sock *req;
- int mss;
+ int full_space, mss;
struct rtable *rt;
__u8 rcv_wscale;
struct flowi4 fl4;
@@ -418,7 +418,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
- security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+ security_req_classify_flow(req, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
reqsk_free(req);
@@ -427,8 +427,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
/* Try to redo what tcp_v4_send_synack did. */
req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+	/* limit the window selection if the user enforces a smaller rx buffer */
+ full_space = tcp_full_space(sk);
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
- tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
+ tcp_select_initial_window(sk, full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 54023a46db04..3e5f4f2e705e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1330,6 +1330,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &comp_sack_nr_max,
},
{
+ .procname = "tcp_reflect_tos",
+ .data = &init_net.ipv4.sysctl_tcp_reflect_tos,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2135ee7c806d..32545ecf2ab1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -418,6 +418,8 @@ void tcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
@@ -483,6 +485,8 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
return true;
if (tcp_rmem_pressure(sk))
return true;
+ if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
+ return true;
}
if (sk->sk_prot->stream_memory_read)
return sk->sk_prot->stream_memory_read(sk);
@@ -562,7 +566,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
mask |= EPOLLIN | EPOLLRDNORM;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_is_writeable(sk)) {
+ if (__sk_stream_is_writeable(sk, 1)) {
mask |= EPOLLOUT | EPOLLWRNORM;
} else { /* send SIGIO later */
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -574,7 +578,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* pairs with the input side.
*/
smp_mb__after_atomic();
- if (sk_stream_is_writeable(sk))
+ if (__sk_stream_is_writeable(sk, 1))
mask |= EPOLLOUT | EPOLLWRNORM;
}
} else
@@ -950,7 +954,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
* importantly be able to generate EPOLLOUT for Edge Trigger epoll()
* users.
*/
-static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
{
if (skb && !skb->len) {
tcp_unlink_write_queue(skb, sk);
@@ -960,6 +964,68 @@ static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
}
}
+struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+ struct page *page, int offset, size_t *size)
+{
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool can_coalesce;
+ int copy, i;
+
+ if (!skb || (copy = size_goal - skb->len) <= 0 ||
+ !tcp_skb_can_collapse_to(skb)) {
+new_segment:
+ if (!sk_stream_memory_free(sk))
+ return NULL;
+
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ tcp_rtx_and_write_queues_empty(sk));
+ if (!skb)
+ return NULL;
+
+#ifdef CONFIG_TLS_DEVICE
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
+ skb_entail(sk, skb);
+ copy = size_goal;
+ }
+
+ if (copy > *size)
+ copy = *size;
+
+ i = skb_shinfo(skb)->nr_frags;
+ can_coalesce = skb_can_coalesce(skb, i, page, offset);
+ if (!can_coalesce && i >= sysctl_max_skb_frags) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ if (!sk_wmem_schedule(sk, copy))
+ return NULL;
+
+ if (can_coalesce) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, copy);
+ }
+
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ sk_wmem_queued_add(sk, copy);
+ sk_mem_charge(sk, copy);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ *size = copy;
+ return skb;
+}
+
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
@@ -995,59 +1061,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
goto out_err;
while (size > 0) {
- struct sk_buff *skb = tcp_write_queue_tail(sk);
- int copy, i;
- bool can_coalesce;
-
- if (!skb || (copy = size_goal - skb->len) <= 0 ||
- !tcp_skb_can_collapse_to(skb)) {
-new_segment:
- if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
-
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- tcp_rtx_and_write_queues_empty(sk));
- if (!skb)
- goto wait_for_memory;
-
-#ifdef CONFIG_TLS_DEVICE
- skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
-#endif
- skb_entail(sk, skb);
- copy = size_goal;
- }
-
- if (copy > size)
- copy = size;
-
- i = skb_shinfo(skb)->nr_frags;
- can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= sysctl_max_skb_frags) {
- tcp_mark_push(tp, skb);
- goto new_segment;
- }
- if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
-
- if (can_coalesce) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
- } else {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, copy);
- }
+ struct sk_buff *skb;
+ size_t copy = size;
- if (!(flags & MSG_NO_SHARED_FRAGS))
- skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
- sk_wmem_queued_add(sk, copy);
- sk_mem_charge(sk, copy);
- skb->ip_summed = CHECKSUM_PARTIAL;
- WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
- TCP_SKB_CB(skb)->end_seq += copy;
- tcp_skb_pcount_set(skb, 0);
+ skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
+ if (!skb)
+ goto wait_for_space;
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
@@ -1068,9 +1087,8 @@ new_segment:
tcp_push_one(sk, mss_now);
continue;
-wait_for_sndbuf:
+wait_for_space:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
@@ -1281,7 +1299,7 @@ restart:
new_segment:
if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
+ goto wait_for_space;
if (unlikely(process_backlog >= 16)) {
process_backlog = 0;
@@ -1292,7 +1310,7 @@ new_segment:
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
first_skb);
if (!skb)
- goto wait_for_memory;
+ goto wait_for_space;
process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -1325,7 +1343,7 @@ new_segment:
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag))
- goto wait_for_memory;
+ goto wait_for_space;
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
@@ -1339,7 +1357,7 @@ new_segment:
copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ goto wait_for_space;
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
pfrag->page,
@@ -1392,9 +1410,8 @@ new_segment:
tcp_push_one(sk, mss_now);
continue;
-wait_for_sndbuf:
+wait_for_space:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
@@ -1526,7 +1543,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
@@ -1539,10 +1556,8 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
if (inet_csk_ack_scheduled(sk)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- /* Delayed ACKs frequently hit locked sockets during bulk
- * receive. */
- if (icsk->icsk_ack.blocked ||
- /* Once-per-two-segments ACK was not sent by tcp_input.c */
+
+ if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
/*
* If this read emptied read buffer, we send ACK, if
@@ -1743,52 +1758,272 @@ int tcp_mmap(struct file *file, struct socket *sock,
}
EXPORT_SYMBOL(tcp_mmap);
+static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
+ u32 *offset_frag)
+{
+ skb_frag_t *frag;
+
+ offset_skb -= skb_headlen(skb);
+ if ((int)offset_skb < 0 || skb_has_frag_list(skb))
+ return NULL;
+
+ frag = skb_shinfo(skb)->frags;
+ while (offset_skb) {
+ if (skb_frag_size(frag) > offset_skb) {
+ *offset_frag = offset_skb;
+ return frag;
+ }
+ offset_skb -= skb_frag_size(frag);
+ ++frag;
+ }
+ *offset_frag = 0;
+ return frag;
+}
+
+static bool can_map_frag(const skb_frag_t *frag)
+{
+ return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
+}
+
+static int find_next_mappable_frag(const skb_frag_t *frag,
+ int remaining_in_skb)
+{
+ int offset = 0;
+
+ if (likely(can_map_frag(frag)))
+ return 0;
+
+ while (offset < remaining_in_skb && !can_map_frag(frag)) {
+ offset += skb_frag_size(frag);
+ ++frag;
+ }
+ return offset;
+}
+
+static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
+ struct tcp_zerocopy_receive *zc,
+ struct sk_buff *skb, u32 offset)
+{
+ u32 frag_offset, partial_frag_remainder = 0;
+ int mappable_offset;
+ skb_frag_t *frag;
+
+ /* worst case: skip to next skb. try to improve on this case below */
+ zc->recv_skip_hint = skb->len - offset;
+
+ /* Find the frag containing this offset (and how far into that frag) */
+ frag = skb_advance_to_frag(skb, offset, &frag_offset);
+ if (!frag)
+ return;
+
+ if (frag_offset) {
+ struct skb_shared_info *info = skb_shinfo(skb);
+
+ /* We read part of the last frag, must recvmsg() rest of skb. */
+ if (frag == &info->frags[info->nr_frags - 1])
+ return;
+
+ /* Else, we must at least read the remainder in this frag. */
+ partial_frag_remainder = skb_frag_size(frag) - frag_offset;
+ zc->recv_skip_hint -= partial_frag_remainder;
+ ++frag;
+ }
+
+ /* partial_frag_remainder: If part way through a frag, must read rest.
+ * mappable_offset: Bytes till next mappable frag, *not* counting bytes
+ * in partial_frag_remainder.
+ */
+ mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
+ zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
+}
+
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags,
+ struct scm_timestamping_internal *tss,
+ int *cmsg_flags);
+static int receive_fallback_to_copy(struct sock *sk,
+ struct tcp_zerocopy_receive *zc, int inq)
+{
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
+ struct scm_timestamping_internal tss_unused;
+ int err, cmsg_flags_unused;
+ struct msghdr msg = {};
+ struct iovec iov;
+
+ zc->length = 0;
+ zc->recv_skip_hint = 0;
+
+ if (copy_address != zc->copybuf_address)
+ return -EINVAL;
+
+ err = import_single_range(READ, (void __user *)copy_address,
+ inq, &iov, &msg.msg_iter);
+ if (err)
+ return err;
+
+ err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0,
+ &tss_unused, &cmsg_flags_unused);
+ if (err < 0)
+ return err;
+
+ zc->copybuf_len = err;
+ if (likely(zc->copybuf_len)) {
+ struct sk_buff *skb;
+ u32 offset;
+
+ skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
+ if (skb)
+ tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
+ }
+ return 0;
+}
+
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
+ struct sk_buff *skb, u32 copylen,
+ u32 *offset, u32 *seq)
+{
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
+ struct msghdr msg = {};
+ struct iovec iov;
+ int err;
+
+ if (copy_address != zc->copybuf_address)
+ return -EINVAL;
+
+ err = import_single_range(READ, (void __user *)copy_address,
+ copylen, &iov, &msg.msg_iter);
+ if (err)
+ return err;
+ err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
+ if (err)
+ return err;
+ zc->recv_skip_hint -= copylen;
+ *offset += copylen;
+ *seq += copylen;
+ return (__s32)copylen;
+}
+
+static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
+ struct sock *sk,
+ struct sk_buff *skb,
+ u32 *seq,
+ s32 copybuf_len)
+{
+ u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
+
+ if (!copylen)
+ return 0;
+ /* skb is null if inq < PAGE_SIZE. */
+ if (skb)
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ else
+ skb = tcp_recv_skb(sk, *seq, &offset);
+
+ zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
+ seq);
+ return zc->copybuf_len < 0 ? 0 : copylen;
+}
+
+static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
+ struct page **pending_pages,
+ unsigned long pages_remaining,
+ unsigned long *address,
+ u32 *length,
+ u32 *seq,
+ struct tcp_zerocopy_receive *zc,
+ u32 total_bytes_to_map,
+ int err)
+{
+ /* At least one page did not map. Try zapping if we skipped earlier. */
+ if (err == -EBUSY &&
+ zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
+ u32 maybe_zap_len;
+
+ maybe_zap_len = total_bytes_to_map - /* All bytes to map */
+ *length + /* Mapped or pending */
+ (pages_remaining * PAGE_SIZE); /* Failed map. */
+ zap_page_range(vma, *address, maybe_zap_len);
+ err = 0;
+ }
+
+ if (!err) {
+ unsigned long leftover_pages = pages_remaining;
+ int bytes_mapped;
+
+ /* We called zap_page_range, try to reinsert. */
+ err = vm_insert_pages(vma, *address,
+ pending_pages,
+ &pages_remaining);
+ bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
+ *seq += bytes_mapped;
+ *address += bytes_mapped;
+ }
+ if (err) {
+ /* Either we were unable to zap, OR we zapped, retried an
+	 * insert, and still had an issue. Either way, pages_remaining
+ * is the number of pages we were unable to map, and we unroll
+ * some state we speculatively touched before.
+ */
+ const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+
+ *length -= bytes_not_mapped;
+ zc->recv_skip_hint += bytes_not_mapped;
+ }
+ return err;
+}
+
static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
struct page **pages,
- unsigned long pages_to_map,
- unsigned long *insert_addr,
- u32 *length_with_pending,
+ unsigned int pages_to_map,
+ unsigned long *address,
+ u32 *length,
u32 *seq,
- struct tcp_zerocopy_receive *zc)
+ struct tcp_zerocopy_receive *zc,
+ u32 total_bytes_to_map)
{
unsigned long pages_remaining = pages_to_map;
- int bytes_mapped;
- int ret;
+ unsigned int pages_mapped;
+ unsigned int bytes_mapped;
+ int err;
- ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
- bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+ err = vm_insert_pages(vma, *address, pages, &pages_remaining);
+ pages_mapped = pages_to_map - (unsigned int)pages_remaining;
+ bytes_mapped = PAGE_SIZE * pages_mapped;
/* Even if vm_insert_pages fails, it may have partially succeeded in
* mapping (some but not all of the pages).
*/
*seq += bytes_mapped;
- *insert_addr += bytes_mapped;
- if (ret) {
- /* But if vm_insert_pages did fail, we have to unroll some state
- * we speculatively touched before.
- */
- const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
- *length_with_pending -= bytes_not_mapped;
- zc->recv_skip_hint += bytes_not_mapped;
- }
- return ret;
+ *address += bytes_mapped;
+
+ if (likely(!err))
+ return 0;
+
+ /* Error: maybe zap and retry + rollback state for failed inserts. */
+ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
+ pages_remaining, address, length, seq, zc, total_bytes_to_map,
+ err);
}
+#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc)
{
+ u32 length = 0, offset, vma_len, avail_len, copylen = 0;
unsigned long address = (unsigned long)zc->address;
- u32 length = 0, seq, offset, zap_len;
- #define PAGE_BATCH_SIZE 8
- struct page *pages[PAGE_BATCH_SIZE];
+ struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
+ s32 copybuf_len = zc->copybuf_len;
+ struct tcp_sock *tp = tcp_sk(sk);
const skb_frag_t *frags = NULL;
+ unsigned int pages_to_map = 0;
struct vm_area_struct *vma;
struct sk_buff *skb = NULL;
- unsigned long pg_idx = 0;
- unsigned long curr_addr;
- struct tcp_sock *tp;
- int inq;
+ u32 seq = tp->copied_seq;
+ u32 total_bytes_to_map;
+ int inq = tcp_inq(sk);
int ret;
+ zc->copybuf_len = 0;
+
if (address & (PAGE_SIZE - 1) || address != zc->address)
return -EINVAL;
@@ -1797,7 +2032,16 @@ static int tcp_zerocopy_receive(struct sock *sk,
sock_rps_record_flow(sk);
- tp = tcp_sk(sk);
+ if (inq && inq <= copybuf_len)
+ return receive_fallback_to_copy(sk, zc, inq);
+
+ if (inq < PAGE_SIZE) {
+ zc->length = 0;
+ zc->recv_skip_hint = inq;
+ if (!inq && sock_flag(sk, SOCK_DONE))
+ return -EIO;
+ return 0;
+ }
mmap_read_lock(current->mm);
@@ -1806,33 +2050,26 @@ static int tcp_zerocopy_receive(struct sock *sk,
mmap_read_unlock(current->mm);
return -EINVAL;
}
- zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
-
- seq = tp->copied_seq;
- inq = tcp_inq(sk);
- zc->length = min_t(u32, zc->length, inq);
- zap_len = zc->length & ~(PAGE_SIZE - 1);
- if (zap_len) {
- zap_page_range(vma, address, zap_len);
+ vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
+ avail_len = min_t(u32, vma_len, inq);
+ total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
+ if (total_bytes_to_map) {
+ if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
+ zap_page_range(vma, address, total_bytes_to_map);
+ zc->length = total_bytes_to_map;
zc->recv_skip_hint = 0;
} else {
- zc->recv_skip_hint = zc->length;
+ zc->length = avail_len;
+ zc->recv_skip_hint = avail_len;
}
ret = 0;
- curr_addr = address;
while (length + PAGE_SIZE <= zc->length) {
+ int mappable_offset;
+ struct page *page;
+
if (zc->recv_skip_hint < PAGE_SIZE) {
- /* If we're here, finish the current batch. */
- if (pg_idx) {
- ret = tcp_zerocopy_vm_insert_batch(vma, pages,
- pg_idx,
- &curr_addr,
- &length,
- &seq, zc);
- if (ret)
- goto out;
- pg_idx = 0;
- }
+ u32 offset_frag;
+
if (skb) {
if (zc->recv_skip_hint > 0)
break;
@@ -1842,56 +2079,57 @@ static int tcp_zerocopy_receive(struct sock *sk,
skb = tcp_recv_skb(sk, seq, &offset);
}
zc->recv_skip_hint = skb->len - offset;
- offset -= skb_headlen(skb);
- if ((int)offset < 0 || skb_has_frag_list(skb))
+ frags = skb_advance_to_frag(skb, offset, &offset_frag);
+ if (!frags || offset_frag)
break;
- frags = skb_shinfo(skb)->frags;
- while (offset) {
- if (skb_frag_size(frags) > offset)
- goto out;
- offset -= skb_frag_size(frags);
- frags++;
- }
}
- if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
- int remaining = zc->recv_skip_hint;
- while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
- skb_frag_off(frags))) {
- remaining -= skb_frag_size(frags);
- frags++;
- }
- zc->recv_skip_hint -= remaining;
+ mappable_offset = find_next_mappable_frag(frags,
+ zc->recv_skip_hint);
+ if (mappable_offset) {
+ zc->recv_skip_hint = mappable_offset;
break;
}
- pages[pg_idx] = skb_frag_page(frags);
- pg_idx++;
+ page = skb_frag_page(frags);
+ prefetchw(page);
+ pages[pages_to_map++] = page;
length += PAGE_SIZE;
zc->recv_skip_hint -= PAGE_SIZE;
frags++;
- if (pg_idx == PAGE_BATCH_SIZE) {
- ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
- &curr_addr, &length,
- &seq, zc);
+ if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
+ zc->recv_skip_hint < PAGE_SIZE) {
+ /* Either full batch, or we're about to go to next skb
+ * (and we cannot unroll failed ops across skbs).
+ */
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+ pages_to_map,
+ &address, &length,
+ &seq, zc,
+ total_bytes_to_map);
if (ret)
goto out;
- pg_idx = 0;
+ pages_to_map = 0;
}
}
- if (pg_idx) {
- ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
- &curr_addr, &length, &seq,
- zc);
+ if (pages_to_map) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
+ &address, &length, &seq,
+ zc, total_bytes_to_map);
}
out:
mmap_read_unlock(current->mm);
- if (length) {
+ /* Try to copy straggler data. */
+ if (!ret)
+ copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
+ copybuf_len);
+
+ if (length + copylen) {
WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
tcp_recv_skb(sk, seq, &offset);
- tcp_cleanup_rbuf(sk, length);
+ tcp_cleanup_rbuf(sk, length + copylen);
ret = 0;
if (length == zc->length)
zc->recv_skip_hint = 0;
@@ -2013,36 +2251,28 @@ static int tcp_inq_hint(struct sock *sk)
* Probably, code can be easily improved even more.
*/
-int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len)
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags,
+ struct scm_timestamping_internal *tss,
+ int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int copied = 0;
u32 peek_seq;
u32 *seq;
unsigned long used;
- int err, inq;
+ int err;
int target; /* Read at least this many bytes */
long timeo;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
- struct scm_timestamping_internal tss;
- int cmsg_flags;
-
- if (unlikely(flags & MSG_ERRQUEUE))
- return inet_recv_error(sk, msg, len, addr_len);
-
- if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
- (sk->sk_state == TCP_ESTABLISHED))
- sk_busy_loop(sk, nonblock);
-
- lock_sock(sk);
err = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;
- cmsg_flags = tp->recvmsg_inq ? 1 : 0;
+ if (tp->recvmsg_inq)
+ *cmsg_flags = 1;
timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */
@@ -2222,8 +2452,8 @@ skip_copy:
}
if (TCP_SKB_CB(skb)->has_rxtstamp) {
- tcp_update_recv_tstamps(skb, &tss);
- cmsg_flags |= 2;
+ tcp_update_recv_tstamps(skb, tss);
+ *cmsg_flags |= 2;
}
if (used + offset < skb->len)
@@ -2249,22 +2479,9 @@ found_fin_ok:
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
-
- release_sock(sk);
-
- if (cmsg_flags) {
- if (cmsg_flags & 2)
- tcp_recv_timestamp(msg, sk, &tss);
- if (cmsg_flags & 1) {
- inq = tcp_inq_hint(sk);
- put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
- }
- }
-
return copied;
out:
- release_sock(sk);
return err;
recv_urg:
@@ -2275,6 +2492,36 @@ recv_sndq:
err = tcp_peek_sndq(sk, msg, len);
goto out;
}
+
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+ int flags, int *addr_len)
+{
+ int cmsg_flags = 0, ret, inq;
+ struct scm_timestamping_internal tss;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (sk_can_busy_loop(sk) &&
+ skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+ sk->sk_state == TCP_ESTABLISHED)
+ sk_busy_loop(sk, nonblock);
+
+ lock_sock(sk);
+ ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
+ &cmsg_flags);
+ release_sock(sk);
+
+ if (cmsg_flags && ret >= 0) {
+ if (cmsg_flags & 2)
+ tcp_recv_timestamp(msg, sk, &tss);
+ if (cmsg_flags & 1) {
+ inq = tcp_inq_hint(sk);
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+ }
+ }
+ return ret;
+}
EXPORT_SYMBOL(tcp_recvmsg);
void tcp_set_state(struct sock *sk, int state)
@@ -2405,13 +2652,12 @@ bool tcp_check_oom(struct sock *sk, int shift)
return too_many_orphans || out_of_socket_memory;
}
-void tcp_close(struct sock *sk, long timeout)
+void __tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
int data_was_unread = 0;
int state;
- lock_sock(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
if (sk->sk_state == TCP_LISTEN) {
@@ -2575,6 +2821,12 @@ adjudge_to_death:
out:
bh_unlock_sock(sk);
local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ lock_sock(sk);
+ __tcp_close(sk, timeout);
release_sock(sk);
sock_put(sk);
}
@@ -2685,7 +2937,10 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_backoff = 0;
icsk->icsk_probes_out = 0;
+ icsk->icsk_probes_tstamp = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd = TCP_INIT_CWND;
tp->snd_cwnd_cnt = 0;
@@ -2695,6 +2950,7 @@ int tcp_disconnect(struct sock *sk, int flags)
if (icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ icsk->icsk_ca_initialized = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
@@ -3019,6 +3275,21 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
}
EXPORT_SYMBOL(tcp_sock_set_keepcnt);
+int tcp_set_window_clamp(struct sock *sk, int val)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!val) {
+ if (sk->sk_state != TCP_CLOSE)
+ return -EINVAL;
+ tp->window_clamp = 0;
+ } else {
+ tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
+ SOCK_MIN_RCVBUF / 2 : val;
+ }
+ return 0;
+}
+
/*
* Socket option code for TCP.
*/
@@ -3046,7 +3317,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
name[val] = 0;
lock_sock(sk);
- err = tcp_set_congestion_control(sk, name, true, true,
+ err = tcp_set_congestion_control(sk, name, true,
ns_capable(sock_net(sk)->user_ns,
CAP_NET_ADMIN));
release_sock(sk);
@@ -3208,7 +3479,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
break;
case TCP_SAVE_SYN:
- if (val < 0 || val > 1)
+ /* 0: disable, 1: enable, 2: start from ether_header */
+ if (val < 0 || val > 2)
err = -EINVAL;
else
tp->save_syn = val;
@@ -3231,15 +3503,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
break;
case TCP_WINDOW_CLAMP:
- if (!val) {
- if (sk->sk_state != TCP_CLOSE) {
- err = -EINVAL;
- break;
- }
- tp->window_clamp = 0;
- } else
- tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
- SOCK_MIN_RCVBUF / 2 : val;
+ err = tcp_set_window_clamp(sk, val);
break;
case TCP_QUICKACK:
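
The two hunks above surface as user-visible socket-option changes: TCP_SAVE_SYN now accepts the value 2 to capture the SYN starting from the Ethernet header, and TCP_WINDOW_CLAMP keeps its old semantics behind the new tcp_set_window_clamp() helper. A brief user-space sketch of the TCP_SAVE_SYN side (not from this diff; constants taken from <linux/tcp.h>, error handling and the accept loop omitted):

#include <linux/tcp.h>		/* TCP_SAVE_SYN, TCP_SAVED_SYN */
#include <netinet/in.h>
#include <sys/socket.h>

static int enable_save_syn(int listen_fd)
{
	int val = 2;	/* 0: off, 1: save from the IP header, 2: from the ether header */

	return setsockopt(listen_fd, IPPROTO_TCP, TCP_SAVE_SYN,
			  &val, sizeof(val));
}

static int read_saved_syn(int accepted_fd, unsigned char *buf, socklen_t *len)
{
	/* Returns the saved headers of the SYN that created this connection;
	 * *len is updated to the saved length.
	 */
	return getsockopt(accepted_fd, IPPROTO_TCP, TCP_SAVED_SYN, buf, len);
}
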
@@ -3789,20 +4053,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
lock_sock(sk);
if (tp->saved_syn) {
- if (len < tp->saved_syn[0]) {
- if (put_user(tp->saved_syn[0], optlen)) {
+ if (len < tcp_saved_syn_len(tp->saved_syn)) {
+ if (put_user(tcp_saved_syn_len(tp->saved_syn),
+ optlen)) {
release_sock(sk);
return -EFAULT;
}
release_sock(sk);
return -EINVAL;
}
- len = tp->saved_syn[0];
+ len = tcp_saved_syn_len(tp->saved_syn);
if (put_user(len, optlen)) {
release_sock(sk);
return -EFAULT;
}
- if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+ if (copy_to_user(optval, tp->saved_syn->data, len)) {
release_sock(sk);
return -EFAULT;
}
@@ -3818,7 +4083,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
}
#ifdef CONFIG_MMU
case TCP_ZEROCOPY_RECEIVE: {
- struct tcp_zerocopy_receive zc;
+ struct tcp_zerocopy_receive zc = {};
int err;
if (get_user(len, optlen))
@@ -3835,7 +4100,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
lock_sock(sk);
err = tcp_zerocopy_receive(sk, &zc);
release_sock(sk);
- if (len == sizeof(zc))
+ if (len >= offsetofend(struct tcp_zerocopy_receive, err))
goto zerocopy_rcv_sk_err;
switch (len) {
case offsetofend(struct tcp_zerocopy_receive, err):
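
Taken together, the tcp_zerocopy_receive() changes above extend getsockopt(TCP_ZEROCOPY_RECEIVE) with an optional bounce buffer: short receives and sub-page tails are copied into copybuf instead of forcing a separate recvmsg(). A minimal user-space sketch, assuming the socket and a page-aligned PROT_READ mapping are already set up (field names as used in the diff):

#include <linux/tcp.h>		/* struct tcp_zerocopy_receive */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>

static ssize_t zerocopy_read(int fd, void *map, unsigned int map_len,
			     void *copybuf, int copybuf_len)
{
	struct tcp_zerocopy_receive zc;
	socklen_t zc_len = sizeof(zc);

	memset(&zc, 0, sizeof(zc));
	zc.address = (__u64)(unsigned long)map;
	zc.length = map_len;
	zc.copybuf_address = (__u64)(unsigned long)copybuf;
	zc.copybuf_len = copybuf_len;

	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len))
		return -1;

	/* zc.length bytes are mapped at 'map', zc.copybuf_len bytes were
	 * copied to 'copybuf', and zc.recv_skip_hint bytes (if any) still
	 * need to be drained with recvmsg().
	 */
	return (ssize_t)zc.length + zc.copybuf_len;
}
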
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 6c4d79baff26..6ea3dc2e4219 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -945,7 +945,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
filter_expired = after(tcp_jiffies32,
bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
if (rs->rtt_us >= 0 &&
- (rs->rtt_us <= bbr->min_rtt_us ||
+ (rs->rtt_us < bbr->min_rtt_us ||
(filter_expired && !rs->is_ack_delayed))) {
bbr->min_rtt_us = rs->rtt_us;
bbr->min_rtt_stamp = tcp_jiffies32;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 7aa68f4aae6c..bc7d2a586e18 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -15,8 +15,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
{
struct iov_iter *iter = &msg->msg_iter;
int peek = flags & MSG_PEEK;
- int i, ret, copied = 0;
struct sk_msg *msg_rx;
+ int i, copied = 0;
msg_rx = list_first_entry_or_null(&psock->ingress_msg,
struct sk_msg, list);
@@ -37,17 +37,16 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
page = sg_page(sge);
if (copied + copy > len)
copy = len - copied;
- ret = copy_page_to_iter(page, sge->offset, copy, iter);
- if (ret != copy) {
- msg_rx->sg.start = i;
- return -EFAULT;
- }
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (!copy)
+ return copied ? copied : -EFAULT;
copied += copy;
if (likely(!peek)) {
sge->offset += copy;
sge->length -= copy;
- sk_mem_uncharge(sk, copy);
+ if (!msg_rx->skb)
+ sk_mem_uncharge(sk, copy);
msg_rx->sg.size -= copy;
if (!sge->length) {
@@ -56,6 +55,11 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
put_page(page);
}
} else {
+			/* Let's not optimize the peek case: if copy_page_to_iter
+			 * didn't copy the entire length, just break out.
+			 */
+ if (copy != sge->length)
+ return copied;
sk_msg_iter_var_next(i);
}
@@ -567,10 +571,9 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
}
-static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops)
{
- if (sk->sk_family == AF_INET6 &&
- unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+ if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
spin_lock_bh(&tcpv6_prot_lock);
if (likely(ops != tcpv6_prot_saved)) {
tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
@@ -603,13 +606,11 @@ struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
- if (!psock->sk_proto) {
- struct proto *ops = READ_ONCE(sk->sk_prot);
-
- if (tcp_bpf_assert_proto_ops(ops))
+ if (sk->sk_family == AF_INET6) {
+ if (tcp_bpf_assert_proto_ops(psock->sk_proto))
return ERR_PTR(-EINVAL);
- tcp_bpf_check_v6_needs_rebuild(sk, ops);
+ tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
}
return &tcp_bpf_prots[family][config];
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 62878cf26d9c..563d016e7478 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -176,7 +176,7 @@ void tcp_assign_congestion_control(struct sock *sk)
void tcp_init_congestion_control(struct sock *sk)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
tcp_sk(sk)->prior_ssthresh = 0;
if (icsk->icsk_ca_ops->init)
@@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk)
INET_ECN_xmit(sk);
else
INET_ECN_dontxmit(sk);
+ icsk->icsk_ca_initialized = 1;
}
static void tcp_reinit_congestion_control(struct sock *sk,
@@ -197,6 +198,11 @@ static void tcp_reinit_congestion_control(struct sock *sk,
icsk->icsk_ca_setsockopt = 1;
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ if (ca->flags & TCP_CONG_NEEDS_ECN)
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
+
if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
tcp_init_congestion_control(sk);
}
@@ -340,7 +346,7 @@ out:
* already initialized.
*/
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
- bool reinit, bool cap_net_admin)
+ bool cap_net_admin)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_congestion_ops *ca;
@@ -361,28 +367,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
goto out;
}
- if (!ca) {
+ if (!ca)
err = -ENOENT;
- } else if (!load) {
- const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;
-
- if (bpf_try_module_get(ca, ca->owner)) {
- if (reinit) {
- tcp_reinit_congestion_control(sk, ca);
- } else {
- icsk->icsk_ca_ops = ca;
- bpf_module_put(old_ca, old_ca->owner);
- }
- } else {
- err = -EBUSY;
- }
- } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) {
+ else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
err = -EPERM;
- } else if (!bpf_try_module_get(ca, ca->owner)) {
+ else if (!bpf_try_module_get(ca, ca->owner))
err = -EBUSY;
- } else {
+ else
tcp_reinit_congestion_control(sk, ca);
- }
out:
rcu_read_unlock();
return err;
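
Annotation: for reference, the userspace call that lands in the simplified branch above. Error handling is trimmed; "cubic" is used because the default and allowed congestion controls are normally flagged non-restricted, so no CAP_NET_ADMIN is needed for them.

#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int select_cubic(int fd)
{
	const char name[] = "cubic";	/* normally in the allowed (non-restricted) set */

	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name));
}
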
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 09b62de04eea..af2814c9342a 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -295,7 +295,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
refcount_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
- tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
+ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b1ce2054291d..9b44caa4b956 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -138,6 +138,69 @@ void clean_acked_data_flush(void)
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
#endif
+#ifdef CONFIG_CGROUP_BPF
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
+ struct bpf_sock_ops_kern sock_ops;
+
+ if (likely(!unknown_opt && !parse_all_opt))
+ return;
+
+ /* The skb will be handled in the
+ * bpf_skops_established() or
+ * bpf_skops_write_hdr_opt().
+ */
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_SYN_SENT:
+ case TCP_LISTEN:
+ return;
+ }
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = bpf_op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+#else
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+}
+#endif
+
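
Annotation: a hedged sketch of the BPF side these hooks call into. Option kind 65 and the points where the flag is set are assumptions of the example; the helpers and constants come from the same bpf-tcp-header-option series this patch belongs to.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int parse_private_opt(struct bpf_sock_ops *skops)
{
	unsigned char opt[4] = { 65, 0, 0, 0 };	/* kind 65 is a made-up private option */

	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* ask TCP to call back when an unknown option is seen */
		bpf_sock_ops_cb_flags_set(skops,
					  skops->bpf_sock_ops_cb_flags |
					  BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
		/* bpf_skops_parse_hdr() above lands here */
		if (bpf_load_hdr_opt(skops, opt, sizeof(opt), 0) > 0) {
			/* opt[] now holds kind, length and payload */
		}
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";
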
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
unsigned int len)
{
@@ -447,7 +510,6 @@ static void tcp_init_buffer_space(struct sock *sk)
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
tcp_sndbuf_expand(sk);
- tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss);
tcp_mstamp_refresh(tp);
tp->rcvq_space.time = tp->tcp_mstamp;
tp->rcvq_space.seq = tp->copied_seq;
@@ -471,6 +533,8 @@ static void tcp_init_buffer_space(struct sock *sk)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
tp->snd_cwnd_stamp = tcp_jiffies32;
+ tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
+ (u32)TCP_INIT_CWND * tp->advmss);
}
/* 4. Recalculate window clamp after socket hit its memory bounds. */
@@ -956,7 +1020,11 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
-/* This must be called before lost_out is incremented */
+ /* This must be called before lost_out or retrans_out are updated
+ * on a new loss, because we want to know if all skbs previously
+ * known to be lost have already been retransmitted, indicating
+ * that this newly lost skb is our next skb to retransmit.
+ */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
@@ -966,41 +1034,36 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
tp->retransmit_skb_hint = skb;
}
-/* Sum the number of packets on the wire we have marked as lost.
- * There are two cases we care about here:
- * a) Packet hasn't been marked lost (nor retransmitted),
- * and this is the first loss.
- * b) Packet has been marked both lost and retransmitted,
- * and this means we think it was lost again.
+/* Sum the number of packets on the wire we have marked as lost, and
+ * notify the congestion control module that the given skb was marked lost.
*/
-static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
- __u8 sacked = TCP_SKB_CB(skb)->sacked;
-
- if (!(sacked & TCPCB_LOST) ||
- ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
- tp->lost += tcp_skb_pcount(skb);
+ tp->lost += tcp_skb_pcount(skb);
}
-static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
- tcp_verify_retransmit_hint(tp, skb);
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+ struct tcp_sock *tp = tcp_sk(sk);
- tp->lost_out += tcp_skb_pcount(skb);
- tcp_sum_lost(tp, skb);
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- }
-}
+ if (sacked & TCPCB_SACKED_ACKED)
+ return;
-void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
-{
tcp_verify_retransmit_hint(tp, skb);
-
- tcp_sum_lost(tp, skb);
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+ if (sacked & TCPCB_LOST) {
+ if (sacked & TCPCB_SACKED_RETRANS) {
+ /* Account for retransmits that are lost again */
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
+ tcp_skb_pcount(skb));
+ tcp_notify_skb_loss_event(tp, skb);
+ }
+ } else {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tcp_notify_skb_loss_event(tp, skb);
}
}
@@ -2263,7 +2326,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
if (cnt > packets)
break;
- tcp_skb_mark_lost(tp, skb);
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
+ tcp_mark_skb_lost(sk, skb);
if (mark_head)
break;
@@ -2483,7 +2547,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
* 1) If the packets in flight is larger than ssthresh, PRR spreads the
* cwnd reductions across a full RTT.
* 2) Otherwise PRR uses packet conservation to send as much as delivered.
- * But when the retransmits are acked without further losses, PRR
+ *    But when SND_UNA is acked without further losses, PRR
* slow starts cwnd up to ssthresh to speed up the recovery.
*/
static void tcp_init_cwnd_reduction(struct sock *sk)
@@ -2500,7 +2564,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tcp_ecn_queue_cwr(tp);
}
-void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2514,8 +2578,7 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) ==
- FLAG_RETRANS_DATA_ACKED) {
+ } else if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
@@ -2626,17 +2689,26 @@ void tcp_simple_retransmit(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- unsigned int mss = tcp_current_mss(sk);
+ int mss;
+
+ /* A fastopen SYN request is stored as two separate packets within
+ * the retransmit queue, this is done by tcp_send_syn_data().
+ * As a result simply checking the MSS of the frames in the queue
+ * will not work for the SYN packet.
+ *
+	 * Being here is an indication of a path MTU issue, so we can
+ * assume that the fastopen SYN was lost and just mark all the
+ * frames in the retransmit queue as lost. We will use an MSS of
+ * -1 to mark all frames as lost, otherwise compute the current MSS.
+ */
+ if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
+ mss = -1;
+ else
+ mss = tcp_current_mss(sk);
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
- if (tcp_skb_seglen(skb) > mss &&
- !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- }
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- }
+ if (tcp_skb_seglen(skb) > mss)
+ tcp_mark_skb_lost(sk, skb);
}
tcp_clear_retrans_hints_partial(tp);
@@ -2787,7 +2859,8 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
} else if (tcp_is_rack(sk)) {
u32 prior_retrans = tp->retrans_out;
- tcp_rack_mark_lost(sk);
+ if (tcp_rack_mark_lost(sk))
+ *ack_flag &= ~FLAG_SET_XMIT_TIMER;
if (prior_retrans > tp->retrans_out)
*ack_flag |= FLAG_LOST_RETRANS;
}
@@ -3312,6 +3385,7 @@ static void tcp_ack_probe(struct sock *sk)
return;
if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
+ icsk->icsk_probes_tstamp = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
@@ -3319,8 +3393,8 @@ static void tcp_ack_probe(struct sock *sk)
} else {
unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
- tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- when, TCP_RTO_MAX);
+ when = tcp_clamp_probe0_to_user_timeout(sk, when);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
}
}
@@ -3362,7 +3436,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
- tcp_cwnd_reduction(sk, acked_sacked, flag);
+ tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag);
} else if (tcp_may_raise_cwnd(sk, flag)) {
/* Advance cwnd if state allows */
tcp_cong_avoid(sk, ack, acked_sacked);
@@ -3743,9 +3817,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
- /* If needed, reset TLP/RTO timer; RACK may later override this. */
- if (flag & FLAG_SET_XMIT_TIMER)
- tcp_set_xmit_timer(sk);
if (tcp_ack_is_dubious(sk, flag)) {
if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) {
@@ -3758,6 +3829,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
&rexmit);
}
+ /* If needed, reset TLP/RTO timer when RACK doesn't set. */
+ if (flag & FLAG_SET_XMIT_TIMER)
+ tcp_set_xmit_timer(sk);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
@@ -3819,7 +3894,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
foc->exp = exp_opt;
}
-static void smc_parse_options(const struct tcphdr *th,
+static bool smc_parse_options(const struct tcphdr *th,
struct tcp_options_received *opt_rx,
const unsigned char *ptr,
int opsize)
@@ -3828,10 +3903,13 @@ static void smc_parse_options(const struct tcphdr *th,
if (static_branch_unlikely(&tcp_have_smc)) {
if (th->syn && !(opsize & 1) &&
opsize >= TCPOLEN_EXP_SMC_BASE &&
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
opt_rx->smc_ok = 1;
+ return true;
+ }
}
#endif
+ return false;
}
/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
@@ -3892,6 +3970,7 @@ void tcp_parse_options(const struct net *net,
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
+ opt_rx->saw_unknown = 0;
while (length > 0) {
int opcode = *ptr++;
@@ -3982,15 +4061,21 @@ void tcp_parse_options(const struct net *net,
*/
if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
get_unaligned_be16(ptr) ==
- TCPOPT_FASTOPEN_MAGIC)
+ TCPOPT_FASTOPEN_MAGIC) {
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
- else
- smc_parse_options(th, opt_rx, ptr,
- opsize);
+ break;
+ }
+
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
+ break;
+
+ opt_rx->saw_unknown = 1;
break;
+ default:
+ opt_rx->saw_unknown = 1;
}
ptr += opsize-2;
length -= opsize;
@@ -4151,10 +4236,13 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
}
/* When we get a reset we do this. */
-void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
trace_tcp_receive_reset(sk);
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb);
+
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
case TCP_SYN_SENT:
@@ -4311,10 +4399,9 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
* The receiver remembers and reflects via DSACKs. Leverage the
* DSACK state and change the txhash to re-route speculatively.
*/
- if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) {
- sk_rethink_txhash(sk);
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
+ sk_rethink_txhash(sk))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
- }
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -4363,7 +4450,8 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
sp[i] = sp[i + 1];
continue;
}
- this_sack++, swalk++;
+ this_sack++;
+ swalk++;
}
}
@@ -4840,7 +4928,8 @@ void tcp_data_ready(struct sock *sk)
int avail = tp->rcv_nxt - tp->copied_seq;
if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) &&
- !sock_flag(sk, SOCK_DONE))
+ !sock_flag(sk, SOCK_DONE) &&
+ tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss)
return;
sk->sk_data_ready(sk);
@@ -4853,7 +4942,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
int eaten;
if (sk_is_mptcp(sk))
- mptcp_incoming_options(sk, skb, &tp->rx_opt);
+ mptcp_incoming_options(sk, skb);
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
@@ -5277,12 +5366,6 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return true;
}
-/* When incoming ACK allowed to free some skb from write_queue,
- * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
- * on the exit from tcp input handler.
- *
- * PROBLEM: sndbuf expansion does not work well with largesend.
- */
static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -5297,16 +5380,13 @@ static void tcp_new_space(struct sock *sk)
static void tcp_check_space(struct sock *sk)
{
- if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
- sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
- /* pairs with tcp_poll() */
- smp_mb();
- if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- tcp_new_space(sk);
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
- }
+ /* pairs with tcp_poll() */
+ smp_mb();
+ if (sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
@@ -5544,7 +5624,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
} else if (tcp_reset_check(sk, skb)) {
- tcp_reset(sk);
+ tcp_reset(sk, skb);
}
goto discard;
}
@@ -5580,7 +5660,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
}
if (rst_seq_match)
- tcp_reset(sk);
+ tcp_reset(sk, skb);
else {
/* Disable TFO if RST is out-of-order
* and no data has been received
@@ -5608,6 +5688,8 @@ syn_challenge:
goto discard;
}
+ bpf_skops_parse_hdr(sk, skb);
+
return true;
discard:
@@ -5766,6 +5848,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
+ } else {
+ tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
}
__tcp_ack_snd_check(sk, 0);
@@ -5816,7 +5900,7 @@ discard:
}
EXPORT_SYMBOL(tcp_rcv_established);
-void tcp_init_transfer(struct sock *sk, int bpf_op)
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -5837,8 +5921,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
tp->snd_cwnd_stamp = tcp_jiffies32;
- tcp_call_bpf(sk, bpf_op, 0, NULL);
- tcp_init_congestion_control(sk);
+ icsk->icsk_ca_initialized = 0;
+ bpf_skops_established(sk, bpf_op, skb);
+ if (!icsk->icsk_ca_initialized)
+ tcp_init_congestion_control(sk);
tcp_init_buffer_space(sk);
}
@@ -5856,7 +5942,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id(sk, skb);
}
- tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
@@ -6011,7 +6097,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (th->rst) {
- tcp_reset(sk);
+ tcp_reset(sk, skb);
goto discard;
}
@@ -6328,7 +6414,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else {
tcp_try_undo_spurious_syn(sk);
tp->retrans_stamp = 0;
- tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
+ skb);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
smp_mb();
@@ -6438,7 +6525,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
if (sk_is_mptcp(sk))
- mptcp_incoming_options(sk, skb, &tp->rx_opt);
+ mptcp_incoming_options(sk, skb);
break;
}
fallthrough;
@@ -6452,7 +6539,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
- tcp_reset(sk);
+ tcp_reset(sk, skb);
return 1;
}
}
@@ -6617,13 +6704,27 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
{
if (tcp_sk(sk)->save_syn) {
u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
- u32 *copy;
+ struct saved_syn *saved_syn;
+ u32 mac_hdrlen;
+ void *base;
+
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
+ base = skb_mac_header(skb);
+ mac_hdrlen = skb_mac_header_len(skb);
+ len += mac_hdrlen;
+ } else {
+ base = skb_network_header(skb);
+ mac_hdrlen = 0;
+ }
- copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
- if (copy) {
- copy[0] = len;
- memcpy(&copy[1], skb_network_header(skb), len);
- req->saved_syn = copy;
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
+ GFP_ATOMIC);
+ if (saved_syn) {
+ saved_syn->mac_hdrlen = mac_hdrlen;
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
+ memcpy(saved_syn->data, base, len);
+ req->saved_syn = saved_syn;
}
}
}
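
Annotation: the old length-prefixed u32 blob becomes a struct with a flexible array member sized via struct_size(). Below is a self-contained illustration of the same allocation pattern; plain malloc() stands in for kmalloc(), and the struct is a stand-in, not the kernel's saved_syn.

#include <stdlib.h>
#include <string.h>

struct hdr_copy {
	unsigned int mac_hdrlen;
	unsigned int network_hdrlen;
	unsigned int tcp_hdrlen;
	unsigned char data[];		/* the copied headers follow the fixed part */
};

static struct hdr_copy *hdr_copy_new(const void *base, size_t len)
{
	/* one allocation sized for the struct plus its trailing bytes,
	 * the same job struct_size(saved_syn, data, len) does in-kernel */
	struct hdr_copy *c = malloc(sizeof(*c) + len);

	if (!c)
		return NULL;
	memcpy(c->data, base, len);
	return c;
}
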
@@ -6719,18 +6820,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
- af_ops->init_req(req, sk, skb);
-
- if (security_inet_conn_request(sk, skb, req))
+ dst = af_ops->route_req(sk, skb, &fl, req);
+ if (!dst)
goto drop_and_free;
if (tmp_opt.tstamp_ok)
tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
- dst = af_ops->route_req(sk, &fl, req);
- if (!dst)
- goto drop_and_free;
-
if (!want_cookie && !isn) {
/* Kill the following clause, if you dislike this way. */
if (!net->ipv4.sysctl_tcp_syncookies &&
@@ -6762,6 +6858,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
@@ -6770,7 +6867,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
- &foc, TCP_SYNACK_FASTOPEN);
+ &foc, TCP_SYNACK_FASTOPEN, skb);
/* Add the child socket directly into the accept queue */
if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
reqsk_fastopen_remove(fastopen_sk, req, false);
@@ -6788,7 +6885,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_timeout_init((struct sock *)req));
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
- TCP_SYNACK_COOKIE);
+ TCP_SYNACK_COOKIE,
+ skb);
if (want_cookie) {
reqsk_free(req);
return 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 592c73962723..777306b5bc22 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -575,7 +575,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
- * is already accepted it is treated as a connected one below.
+ * already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
@@ -965,26 +965,38 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
+ u8 tos;
/* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+ tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (inet_sk(sk)->tos & INET_ECN_MASK) :
+ inet_sk(sk)->tos;
+
+ if (!INET_ECN_is_capable(tos) &&
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
+ tos |= INET_ECN_ECT_0;
+
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt));
+ rcu_dereference(ireq->ireq_opt),
+ tos);
rcu_read_unlock();
err = net_xmit_eval(err);
}
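
Annotation: the reflection rule above keeps the client's DSCP while preserving the local ECN bits, and only applies when net.ipv4.tcp_reflect_tos is enabled. Restated as a standalone helper; INET_ECN_MASK is re-declared here just for the sketch.

#define INET_ECN_MASK 3		/* same value as in include/net/inet_ecn.h */

static unsigned char reflect_tos(unsigned char syn_tos, unsigned char sock_tos,
				 int reflect_enabled)
{
	if (!reflect_enabled)
		return sock_tos;
	/* DSCP from the incoming SYN, ECN bits from the listening socket */
	return (syn_tos & ~INET_ECN_MASK) | (sock_tos & INET_ECN_MASK);
}
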
@@ -1433,9 +1445,15 @@ static void tcp_v4_init_req(struct request_sock *req,
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+ struct sk_buff *skb,
struct flowi *fl,
- const struct request_sock *req)
+ struct request_sock *req)
{
+ tcp_v4_init_req(req, sk, skb);
+
+ if (security_inet_conn_request(sk, skb, req))
+ return NULL;
+
return inet_csk_route_req(sk, &fl->u.ip4, req);
}
@@ -1455,7 +1473,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.req_md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
#endif
- .init_req = tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
.cookie_init_seq = cookie_v4_init_sequence,
#endif
@@ -1492,6 +1509,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
bool *own_req)
{
struct inet_request_sock *ireq;
+ bool found_dup_sk = false;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
@@ -1529,6 +1547,12 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = prandom_u32();
+	/* Set ToS of the new socket based upon the value of the incoming SYN.
+ * ECT bits are set later in tcp_init_transfer().
+ */
+ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
if (!dst)
@@ -1565,12 +1589,22 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
+ &found_dup_sk);
if (likely(*own_req)) {
tcp_move_syn(newtp, req);
ireq->ireq_opt = NULL;
} else {
newinet->inet_opt = NULL;
+
+ if (!req_unhash && found_dup_sk) {
+			/* This code path should only be executed in the
+			 * syncookie case
+ */
+ bh_unlock_sock(newsk);
+ sock_put(newsk);
+ newsk = NULL;
+ }
}
return newsk;
@@ -1726,6 +1760,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
+ u32 tail_gso_size, tail_gso_segs;
struct skb_shared_info *shinfo;
const struct tcphdr *th;
struct tcphdr *thtail;
@@ -1733,6 +1768,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
unsigned int hdrlen;
bool fragstolen;
u32 gso_segs;
+ u32 gso_size;
int delta;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
@@ -1758,13 +1794,6 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
*/
th = (const struct tcphdr *)skb->data;
hdrlen = th->doff * 4;
- shinfo = skb_shinfo(skb);
-
- if (!shinfo->gso_size)
- shinfo->gso_size = skb->len - hdrlen;
-
- if (!shinfo->gso_segs)
- shinfo->gso_segs = 1;
tail = sk->sk_backlog.tail;
if (!tail)
@@ -1787,6 +1816,15 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
goto no_coalesce;
__skb_pull(skb, hdrlen);
+
+ shinfo = skb_shinfo(skb);
+ gso_size = shinfo->gso_size ?: skb->len;
+ gso_segs = shinfo->gso_segs ?: 1;
+
+ shinfo = skb_shinfo(tail);
+ tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
+ tail_gso_segs = shinfo->gso_segs ?: 1;
+
if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -1813,11 +1851,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
}
/* Not as strict as GRO. We only need to carry mss max value */
- skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
- skb_shinfo(tail)->gso_size);
-
- gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
- skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+ shinfo->gso_size = max(gso_size, tail_gso_size);
+ shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
sk->sk_backlog.len += delta;
__NET_INC_STATS(sock_net(sk),
@@ -2730,6 +2765,20 @@ void tcp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+bool tcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 notsent_bytes = READ_ONCE(tp->write_seq) -
+ READ_ONCE(tp->snd_nxt);
+
+ return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
+}
+EXPORT_SYMBOL(tcp_stream_memory_free);
+
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
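
Annotation: tcp_stream_memory_free() compares against tcp_notsent_lowat(), i.e. the per-socket TCP_NOTSENT_LOWAT value or the tcp_notsent_lowat sysctl. A hypothetical userspace pairing that bounds the unsent backlog per socket:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int bound_unsent(int fd)
{
	int lowat = 128 * 1024;	/* write-space wakeups report EPOLLOUT once unsent data is below ~half of this */

	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
}
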
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 8c643a4ffad1..e6459537d4d2 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -89,6 +89,7 @@ struct lp {
/**
* tcp_lp_init
+ * @sk: socket to initialize congestion control algorithm for
*
* Init all required variables.
* Clone the handling from Vegas module implementation.
@@ -111,6 +112,7 @@ static void tcp_lp_init(struct sock *sk)
/**
* tcp_lp_cong_avoid
+ * @sk: socket to avoid congesting
*
* Implementation of cong_avoid.
* Will only call newReno CA when away from inference.
@@ -126,6 +128,7 @@ static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/**
* tcp_lp_remote_hz_estimator
+ * @sk: socket which needs an estimate for the remote HZs
*
* Estimate remote HZ.
* We keep on updating the estimated value, where original TCP-LP
@@ -176,6 +179,7 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
/**
* tcp_lp_owd_calculator
+ * @sk: socket to calculate one way delay for
*
* Calculate one way delay (in relative format).
* Original implement OWD as minus of remote time difference to local time
@@ -210,6 +214,8 @@ static u32 tcp_lp_owd_calculator(struct sock *sk)
/**
* tcp_lp_rtt_sample
+ * @sk: socket to add a rtt sample to
+ * @rtt: round trip time, which is ignored!
*
* Implementation of rtt_sample.
* Will take the following action,
@@ -254,6 +260,7 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
/**
* tcp_lp_pkts_acked
+ * @sk: socket requiring congestion avoidance calculations
*
* Implementation of pkts_acked.
* Deal with active drop under Early Congestion Indication.
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 279db8822439..0588b004ddac 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -887,7 +887,7 @@ static void tcp_metrics_flush_all(struct net *net)
pp = &hb->chain;
for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
match = net ? net_eq(tm_net(tm), net) :
- !refcount_read(&tm_net(tm)->count);
+ !refcount_read(&tm_net(tm)->ns.count);
if (match) {
*pp = tm->tcpm_next;
kfree_rcu(tm, rcu_head);
@@ -943,7 +943,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-static const struct genl_ops tcp_metrics_nl_ops[] = {
+static const struct genl_small_ops tcp_metrics_nl_ops[] = {
{
.cmd = TCP_METRICS_CMD_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -966,8 +966,8 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = {
.policy = tcp_metrics_nl_policy,
.netnsok = true,
.module = THIS_MODULE,
- .ops = tcp_metrics_nl_ops,
- .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
+ .small_ops = tcp_metrics_nl_ops,
+ .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
};
static unsigned int tcpmhash_entries;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 495dda2449fe..0055ae0a3bf8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -801,7 +801,7 @@ embryonic_reset:
req->rsk_ops->send_reset(sk, skb);
} else if (fastopen) { /* received a valid RST pkt */
reqsk_fastopen_remove(sk, req, true);
- tcp_reset(sk);
+ tcp_reset(sk, skb);
}
if (!fastopen) {
inet_csk_reqsk_queue_drop(sk, req);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 85ff417bda7f..8478cf749821 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -438,20 +438,161 @@ struct tcp_out_options {
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
+ u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
struct mptcp_out_options mptcp;
};
-static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
+static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp,
+ struct tcp_out_options *opts)
{
#if IS_ENABLED(CONFIG_MPTCP)
if (unlikely(OPTION_MPTCP & opts->options))
- mptcp_write_options(ptr, &opts->mptcp);
+ mptcp_write_options(ptr, tp, &opts->mptcp);
#endif
}
+#ifdef CONFIG_CGROUP_BPF
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
+ enum tcp_synack_type synack_type)
+{
+ if (unlikely(!skb))
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
+
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
+
+ return 0;
+}
+
+/* req, syn_skb and synack_type are used when writing synack */
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
+ !*remaining)
+ return;
+
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
+
+ /* init sock_ops */
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
+
+ if (req) {
+ /* The listen "sk" cannot be passed here because
+ * it is not locked. It would not make too much
+ * sense to do bpf_setsockopt(listen_sk) based
+ * on individual connection request also.
+ *
+ * Thus, "req" is passed here and the cgroup-bpf-progs
+ * of the listen "sk" will be run.
+ *
+	 * "req" is also used here for fastopen even though the "sk" here is
+ * a fullsock "child" sk. It is to keep the behavior
+ * consistent between fastopen and non-fastopen on
+ * the bpf programming side.
+ */
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = *remaining;
+ /* tcp_current_mss() does not pass a skb */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err || sock_ops.remaining_opt_len == *remaining)
+ return;
+
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
+ /* round up to 4 bytes */
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
+
+ *remaining -= opts->bpf_opt_len;
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!max_opt_len))
+ return;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+
+ if (req) {
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = max_opt_len;
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err)
+ nr_written = 0;
+ else
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
+
+ if (nr_written < max_opt_len)
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
+ max_opt_len - nr_written);
+}
+#else
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+}
+#endif
+
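
Annotation: a hedged sketch of the BPF side of the two callbacks above: reserve space in HDR_OPT_LEN_CB, then emit an option in WRITE_HDR_OPT_CB. The kind 65 option and its payload are illustrative assumptions; the helpers and constants come from the same series as this patch.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int write_private_opt(struct bpf_sock_ops *skops)
{
	unsigned char opt[4] = { 65, 4, 0xab, 0xcd };	/* kind, len, 2 payload bytes (all made up) */

	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		bpf_sock_ops_cb_flags_set(skops,
					  skops->bpf_sock_ops_cb_flags |
					  BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
		/* feeds remaining_opt_len, which becomes opts->bpf_opt_len above */
		bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
		break;
	case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
		/* called from bpf_skops_write_hdr_opt(), the last option writer */
		bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";
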
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -561,7 +702,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
smc_options_write(ptr, &options);
- mptcp_options_write(ptr, opts);
+ mptcp_options_write(ptr, tp, opts);
}
static void smc_set_option(const struct tcp_sock *tp,
@@ -691,6 +832,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -701,7 +844,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
struct tcp_out_options *opts,
const struct tcp_md5sig_key *md5,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -758,6 +902,9 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+ synack_type, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -826,6 +973,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
}
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+ size = MAX_TCP_OPTION_SPACE - remaining;
+ }
+
return size;
}
@@ -883,9 +1039,9 @@ static void tcp_tsq_handler(struct sock *sk)
* transferring tsq->head because tcp_wfree() might
* interrupt us (non NAPI drivers)
*/
-static void tcp_tasklet_func(unsigned long data)
+static void tcp_tasklet_func(struct tasklet_struct *t)
{
- struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+ struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet);
LIST_HEAD(list);
unsigned long flags;
struct list_head *q, *n;
@@ -970,9 +1126,7 @@ void __init tcp_tasklet_init(void)
struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
INIT_LIST_HEAD(&tsq->head);
- tasklet_init(&tsq->tasklet,
- tcp_tasklet_func,
- (unsigned long)tsq);
+ tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
}
}
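
Annotation: the conversion above follows the generic tasklet_setup()/from_tasklet() pattern; a minimal sketch with a made-up driver structure standing in for tsq_tasklet.

#include <linux/interrupt.h>

struct foo_dev {
	struct tasklet_struct tasklet;
	int pending;
};

static void foo_tasklet_func(struct tasklet_struct *t)
{
	/* recover the container from the embedded tasklet_struct */
	struct foo_dev *dev = from_tasklet(dev, t, tasklet);

	dev->pending = 0;
}

static void foo_init(struct foo_dev *dev)
{
	/* replaces tasklet_init(&dev->tasklet, func, (unsigned long)dev) */
	tasklet_setup(&dev->tasklet, foo_tasklet_func);
}
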
@@ -1193,7 +1347,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
}
}
- tcp_options_write((__be32 *)(th + 1), tp, &opts);
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
th->window = htons(tcp_select_window(sk));
@@ -1204,6 +1357,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
*/
th->window = htons(min(tp->rcv_wnd, 65535U));
}
+
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
+
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
@@ -1213,6 +1369,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
}
#endif
+ /* BPF prog is the last one writing header option */
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
+
INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
tcp_v6_send_check, tcp_v4_send_check,
sk, skb);
@@ -1411,6 +1570,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
if (!buff)
return -ENOMEM; /* We'll just try again later. */
skb_copy_decrypted(buff, skb);
+ mptcp_skb_ext_copy(buff, skb);
sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
@@ -1524,7 +1684,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
skb->truesize -= delta_truesize;
sk_wmem_queued_add(sk, -delta_truesize);
sk_mem_uncharge(sk, delta_truesize);
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
}
/* Any change of skb->len requires recalculation of tso factor. */
@@ -1723,7 +1882,8 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
* window, and remember whether we were cwnd-limited then.
*/
if (!before(tp->snd_una, tp->max_packets_seq) ||
- tp->packets_out > tp->max_packets_out) {
+ tp->packets_out > tp->max_packets_out ||
+ is_cwnd_limited) {
tp->max_packets_out = tp->packets_out;
tp->max_packets_seq = tp->snd_nxt;
tp->is_cwnd_limited = is_cwnd_limited;
@@ -1966,6 +2126,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (unlikely(!buff))
return -ENOMEM;
skb_copy_decrypted(buff, skb);
+ mptcp_skb_ext_copy(buff, skb);
sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
@@ -2236,6 +2397,7 @@ static int tcp_mtu_probe(struct sock *sk)
skb = tcp_send_head(sk);
skb_copy_decrypted(nskb, skb);
+ mptcp_skb_ext_copy(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
@@ -2545,6 +2707,10 @@ repair:
else
tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+ is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
+ if (likely(sent_pkts || is_cwnd_limited))
+ tcp_cwnd_validate(sk, is_cwnd_limited);
+
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
@@ -2552,8 +2718,6 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk, false);
- is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
- tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
return !tp->packets_out && !tcp_write_queue_empty(sk);
@@ -3336,20 +3500,20 @@ int tcp_send_synack(struct sock *sk)
}
/**
- * tcp_make_synack - Prepare a SYN-ACK.
- * sk: listener socket
- * dst: dst entry attached to the SYNACK
- * req: request_sock pointer
- * foc: cookie for tcp fast open
- * synack_type: Type of synback to prepare
- *
- * Allocate one skb and build a SYNACK packet.
- * @dst is consumed : Caller should not use it again.
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
+ * @sk: listener socket
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
+ * should not use it again.
+ * @req: request_sock pointer
+ * @foc: cookie for tcp fast open
+ * @synack_type: Type of synack to prepare
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
*/
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -3408,8 +3572,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+ /* bpf program will be interested in the tcp_flags */
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
- foc, synack_type) + sizeof(*th);
+ foc, synack_type,
+ syn_skb) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -3441,6 +3608,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_unlock();
#endif
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
+ synack_type, &opts);
+
skb->skb_mstamp_ns = now;
tcp_add_tx_delay(skb, tp);
@@ -3741,16 +3911,15 @@ void tcp_send_delayed_ack(struct sock *sk)
ato = min(ato, max_ato);
}
+ ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
+
/* Stay within the limit we were given */
timeout = jiffies + ato;
/* Use new timeout only if there wasn't a older one earlier. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
- /* If delack timer was blocked or is about to expire,
- * send ACK now.
- */
- if (icsk->icsk_ack.blocked ||
- time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+ /* If delack timer is about to expire, send ACK now. */
+ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
@@ -3779,10 +3948,15 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
buff = alloc_skb(MAX_TCP_HEADER,
sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (unlikely(!buff)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long delay;
+
+ delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
+ if (delay < TCP_RTO_MAX)
+ icsk->icsk_ack.retry++;
inet_csk_schedule_ack(sk);
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
return;
}
@@ -3910,6 +4084,7 @@ void tcp_send_probe0(struct sock *sk)
/* Cancel probe timer, if it is not required. */
icsk->icsk_probes_out = 0;
icsk->icsk_backoff = 0;
+ icsk->icsk_probes_tstamp = 0;
return;
}
@@ -3924,6 +4099,8 @@ void tcp_send_probe0(struct sock *sk)
*/
timeout = TCP_RESOURCE_PROBE_INTERVAL;
}
+
+ timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
}
@@ -3934,7 +4111,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
int res;
tcp_rsk(req)->txhash = net_tx_rndhash();
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ NULL);
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index fdb715bdd2d1..6f1b4ac7fe99 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -2,20 +2,6 @@
#include <linux/tcp.h>
#include <net/tcp.h>
-void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- /* Account for retransmits that are lost again */
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
- tcp_skb_pcount(skb));
- }
-}
-
static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
return t1 > t2 || (t1 == t2 && after(seq1, seq2));
@@ -110,13 +96,13 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
}
}
-void tcp_rack_mark_lost(struct sock *sk)
+bool tcp_rack_mark_lost(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout;
if (!tp->rack.advanced)
- return;
+ return false;
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
@@ -126,6 +112,7 @@ void tcp_rack_mark_lost(struct sock *sk)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}
+ return !!timeout;
}
/* Record the most recently (re)sent time among the (s)acked packets
@@ -167,6 +154,7 @@ void tcp_rack_reo_timeout(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout, prior_inflight;
+ u32 lost = tp->lost;
prior_inflight = tcp_packets_in_flight(tp);
tcp_rack_detect_loss(sk, &timeout);
@@ -174,7 +162,7 @@ void tcp_rack_reo_timeout(struct sock *sk)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
tcp_enter_recovery(sk, false);
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
- tcp_cwnd_reduction(sk, 1, 0);
+ tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0);
}
tcp_xmit_retransmit_queue(sk);
}
@@ -246,6 +234,6 @@ void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
mss, mss, GFP_ATOMIC);
- tcp_skb_mark_lost_uncond_verify(tp, skb);
+ tcp_mark_skb_lost(sk, skb);
}
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 6cebf412d590..5842081bc8a2 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -10,7 +10,7 @@
#include <net/tcp.h>
/* These factors derived from the recommended values in the aer:
- * .01 and and 7/8.
+ * .01 and 7/8.
*/
#define TCP_SCALABLE_AI_CNT 100U
#define TCP_SCALABLE_MD_SCALE 3
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0c08c420fbc2..4ef08079ccfa 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -40,6 +40,24 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
}
+u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 remaining;
+ s32 elapsed;
+
+ if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
+ return when;
+
+ elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
+ if (unlikely(elapsed < 0))
+ elapsed = 0;
+ remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
+ remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
+
+ return min_t(u32, remaining, when);
+}
+
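
Annotation: icsk_user_timeout is the value set via TCP_USER_TIMEOUT; with the helper above it now also caps how long zero-window probing may continue. Hypothetical userspace usage:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int bound_stall_time(int fd)
{
	unsigned int ms = 30000;	/* give up after ~30s without forward progress */

	return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &ms, sizeof(ms));
}
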
/**
* tcp_write_err() - close socket and save error info
* @sk: The socket the error has appeared on.
@@ -219,14 +237,8 @@ static int tcp_write_timeout(struct sock *sk)
int retry_until;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (icsk->icsk_retransmits) {
- dst_negative_advice(sk);
- } else {
- sk_rethink_txhash(sk);
- tp->timeout_rehash++;
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPTIMEOUTREHASH);
- }
+ if (icsk->icsk_retransmits)
+ __dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
expired = icsk->icsk_retransmits >= retry_until;
} else {
@@ -234,12 +246,7 @@ static int tcp_write_timeout(struct sock *sk)
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
- dst_negative_advice(sk);
- } else {
- sk_rethink_txhash(sk);
- tp->timeout_rehash++;
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPTIMEOUTREHASH);
+ __dst_negative_advice(sk);
}
retry_until = net->ipv4.sysctl_tcp_retries2;
@@ -270,6 +277,11 @@ static int tcp_write_timeout(struct sock *sk)
return 1;
}
+ if (sk_rethink_txhash(sk)) {
+ tp->timeout_rehash++;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
+ }
+
return 0;
}
@@ -331,7 +343,6 @@ static void tcp_delack_timer(struct timer_list *t)
if (!sock_owned_by_user(sk)) {
tcp_delack_timer_handler(sk);
} else {
- icsk->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* delegate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
@@ -350,6 +361,7 @@ static void tcp_probe_timer(struct sock *sk)
if (tp->packets_out || !skb) {
icsk->icsk_probes_out = 0;
+ icsk->icsk_probes_tstamp = 0;
return;
}
@@ -361,13 +373,12 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
- if (icsk->icsk_user_timeout) {
- u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out,
- tcp_probe0_base(sk));
-
- if (elapsed >= icsk->icsk_user_timeout)
- goto abort;
- }
+ if (!icsk->icsk_probes_tstamp)
+ icsk->icsk_probes_tstamp = tcp_jiffies32;
+ else if (icsk->icsk_user_timeout &&
+ (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
+ msecs_to_jiffies(icsk->icsk_user_timeout))
+ goto abort;
max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 3f51e781562a..c8003c8aad2c 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -293,10 +293,10 @@ size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
const struct vegas *ca = inet_csk_ca(sk);
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- info->vegas.tcpv_enabled = ca->doing_vegas_now,
- info->vegas.tcpv_rttcnt = ca->cntRTT,
- info->vegas.tcpv_rtt = ca->baseRTT,
- info->vegas.tcpv_minrtt = ca->minRTT,
+ info->vegas.tcpv_enabled = ca->doing_vegas_now;
+ info->vegas.tcpv_rttcnt = ca->cntRTT;
+ info->vegas.tcpv_rtt = ca->baseRTT;
+ info->vegas.tcpv_minrtt = ca->minRTT;
*attr = INET_DIAG_VEGASINFO;
return sizeof(struct tcpvegas_info);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e88efba07551..69ea76578abb 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -541,7 +541,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
inet_sdif(skb), udptable, skb);
}
-struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
+struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport)
{
const struct iphdr *iph = ip_hdr(skb);
@@ -550,7 +550,6 @@ struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
iph->daddr, dport, inet_iif(skb),
inet_sdif(skb), &udp_table, NULL);
}
-EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
/* Must be called under rcu_read_lock().
* Does increment socket refcount.
@@ -702,7 +701,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
iph->saddr, uh->source, skb->dev->ifindex,
inet_sdif(skb), udptable, NULL);
- if (!sk) {
+ if (!sk || udp_sk(sk)->encap_type) {
/* No socket for error: try tunnels before discarding */
sk = ERR_PTR(-ENOENT);
if (static_branch_unlikely(&udp_encap_needed_key)) {
@@ -874,7 +873,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
struct udphdr *uh;
- int err = 0;
+ int err;
int is_udplite = IS_UDPLITE(sk);
int offset = skb_transport_offset(skb);
int len = skb->len - offset;
@@ -1170,7 +1169,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = inet->uc_index;
} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
/* oif is set, packet is to local broadcast and
- * and uc_index is set. oif is most likely set
+ * uc_index is set. oif is most likely set
* by sk_bound_dev_if. If uc_index != oif check if the
* oif is an L3 master and uc_index is an L3 slave.
* If so, we want to allow the send using the uc_index.
@@ -1197,7 +1196,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
faddr, saddr, dport, inet->inet_sport,
sk->sk_uid);
- security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
@@ -2038,6 +2037,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (rc == -ENOMEM)
UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
+ else
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
+ is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
trace_udp_fail_queue_rcv_skb(rc, sk);
@@ -2173,7 +2175,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
__skb_pull(skb, skb_transport_offset(skb));
ret = udp_queue_rcv_one_skb(sk, skb);
if (ret > 0)
- ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret);
+ ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
}
return 0;
}
@@ -2553,7 +2555,8 @@ int udp_v4_early_demux(struct sk_buff *skb)
*/
if (!inet_sk(sk)->inet_daddr && in_dev)
return ip_mc_validate_source(skb, iph->daddr,
- iph->saddr, iph->tos,
+ iph->saddr,
+ iph->tos & IPTOS_RT_MASK,
skb->dev, in_dev, &itag);
}
return 0;
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index eddd973e6575..7a94791efc1a 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -22,10 +22,9 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
prot->close = sock_map_close;
}
-static void udp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
{
- if (sk->sk_family == AF_INET6 &&
- unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) {
+ if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) {
spin_lock_bh(&udpv6_prot_lock);
if (likely(ops != udpv6_prot_saved)) {
udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops);
@@ -46,8 +45,8 @@ struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
{
int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
- if (!psock->sk_proto)
- udp_bpf_check_v6_needs_rebuild(sk, READ_ONCE(sk->sk_prot));
+ if (sk->sk_family == AF_INET6)
+ udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
return &udp_bpf_prots[family];
}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 1dbece34496e..b2cee9a307d4 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -30,7 +30,7 @@ static int udp_dump_one(struct udp_table *tbl,
const struct inet_diag_req_v2 *req)
{
struct sk_buff *in_skb = cb->skb;
- int err = -EINVAL;
+ int err;
struct sock *sk = NULL;
struct sk_buff *rep;
struct net *net = sock_net(in_skb->sk);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index e67a66fbf27b..cfc872689b99 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -49,6 +49,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
__skb_pull(skb, tnl_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb_set_transport_header(skb, skb_inner_transport_offset(skb));
skb->mac_len = skb_inner_network_offset(skb);
skb->protocol = new_protocol;
@@ -67,6 +68,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
(NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
features &= skb->dev->hw_enc_features;
+ /* CRC checksum can't be handled by HW when it's a UDP tunneling packet. */
+ features &= ~NETIF_F_SCTP_CRC;
/* The only checksum offload we care about from here on out is the
* outer one so strip the existing checksum feature flags and
@@ -184,8 +187,67 @@ out_unlock:
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);
+static void __udpv4_gso_segment_csum(struct sk_buff *seg,
+ __be32 *oldip, __be32 *newip,
+ __be16 *oldport, __be16 *newport)
+{
+ struct udphdr *uh;
+ struct iphdr *iph;
+
+ if (*oldip == *newip && *oldport == *newport)
+ return;
+
+ uh = udp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ if (uh->check) {
+ inet_proto_csum_replace4(&uh->check, seg, *oldip, *newip,
+ true);
+ inet_proto_csum_replace2(&uh->check, seg, *oldport, *newport,
+ false);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+ *oldport = *newport;
+
+ csum_replace4(&iph->check, *oldip, *newip);
+ *oldip = *newip;
+}
+
+static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+ struct sk_buff *seg;
+ struct udphdr *uh, *uh2;
+ struct iphdr *iph, *iph2;
+
+ seg = segs;
+ uh = udp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ if ((udp_hdr(seg)->dest == udp_hdr(seg->next)->dest) &&
+ (udp_hdr(seg)->source == udp_hdr(seg->next)->source) &&
+ (ip_hdr(seg)->daddr == ip_hdr(seg->next)->daddr) &&
+ (ip_hdr(seg)->saddr == ip_hdr(seg->next)->saddr))
+ return segs;
+
+ while ((seg = seg->next)) {
+ uh2 = udp_hdr(seg);
+ iph2 = ip_hdr(seg);
+
+ __udpv4_gso_segment_csum(seg,
+ &iph2->saddr, &iph->saddr,
+ &uh2->source, &uh->source);
+ __udpv4_gso_segment_csum(seg,
+ &iph2->daddr, &iph->daddr,
+ &uh2->dest, &uh->dest);
+ }
+
+ return segs;
+}
+
static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
- netdev_features_t features)
+ netdev_features_t features,
+ bool is_ipv6)
{
unsigned int mss = skb_shinfo(skb)->gso_size;
@@ -195,11 +257,11 @@ static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss);
- return skb;
+ return is_ipv6 ? skb : __udpv4_gso_segment_list_csum(skb);
}
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
- netdev_features_t features)
+ netdev_features_t features, bool is_ipv6)
{
struct sock *sk = gso_skb->sk;
unsigned int sum_truesize = 0;
@@ -211,7 +273,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
__be16 newlen;
if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)
- return __udp_gso_segment_list(gso_skb, features);
+ return __udp_gso_segment_list(gso_skb, features, is_ipv6);
mss = skb_shinfo(gso_skb)->gso_size;
if (gso_skb->len <= sizeof(*uh) + mss)
@@ -325,7 +387,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
goto out;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
- return __udp_gso_segment(skb, features);
+ return __udp_gso_segment(skb, features, false);
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
@@ -366,7 +428,7 @@ out:
static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
struct sk_buff *skb)
{
- struct udphdr *uh = udp_hdr(skb);
+ struct udphdr *uh = udp_gro_udphdr(skb);
struct sk_buff *pp = NULL;
struct udphdr *uh2;
struct sk_buff *p;
@@ -500,12 +562,22 @@ out:
}
EXPORT_SYMBOL(udp_gro_receive);
+static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
+ __be16 dport)
+{
+ const struct iphdr *iph = skb_gro_network_header(skb);
+
+ return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
+ iph->daddr, dport, inet_iif(skb),
+ inet_sdif(skb), &udp_table, NULL);
+}
+
INDIRECT_CALLABLE_SCOPE
struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sock *sk = NULL;
struct sk_buff *pp;
- struct sock *sk;
if (unlikely(!uh))
goto flush;
@@ -523,7 +595,10 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 0;
rcu_read_lock();
- sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
+
+ if (static_branch_unlikely(&udp_encap_needed_key))
+ sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
+
pp = udp_gro_receive(head, skb, uh, sk);
rcu_read_unlock();
return pp;
@@ -551,8 +626,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
{
__be16 newlen = htons(skb->len - nhoff);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- int err = -ENOSYS;
struct sock *sk;
+ int err;
uh->len = newlen;
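__udpv4_gso_segment_csum() above relies on incremental checksum updates (inet_proto_csum_replace4()/csum_replace4()): when a checksummed field changes, the checksum is patched from the old and new values rather than recomputed over the whole header. A self-contained userspace sketch of that arithmetic, following RFC 1624 (illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit accumulator into a 16-bit ones'-complement sum. */
static uint16_t csum_fold(uint32_t sum)
{
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)sum;
}

/* Full checksum over 16-bit words (host order, for illustration). */
static uint16_t csum_full(const uint16_t *w, int n)
{
    uint32_t sum = 0;

    while (n--)
        sum += *w++;
    return (uint16_t)~csum_fold(sum);
}

/* RFC 1624 incremental update: HC' = ~(~HC + ~m + m'). */
static uint16_t csum_update16(uint16_t check, uint16_t old_val, uint16_t new_val)
{
    uint32_t sum = (uint16_t)~check;

    sum += (uint16_t)~old_val;
    sum += new_val;
    return (uint16_t)~csum_fold(sum);
}

int main(void)
{
    uint16_t hdr[4] = { 0x1234, 0x5678, 0x9abc, 0x0000 };
    uint16_t before = csum_full(hdr, 4);

    hdr[1] = 0x1111;                                   /* rewrite one field */
    printf("%04x %04x\n",                              /* both values match */
           csum_update16(before, 0x5678, 0x1111), csum_full(hdr, 4));
    return 0;
}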
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index 69962165c0e8..0d122edc368d 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -19,8 +19,9 @@ enum udp_tunnel_nic_table_entry_flags {
struct udp_tunnel_nic_table_entry {
__be16 port;
u8 type;
- u8 use_cnt;
u8 flags;
+ u16 use_cnt;
+#define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX
u8 hw_priv;
};
@@ -370,6 +371,8 @@ udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn,
bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
unsigned int from, to;
+ WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX);
+
/* If not going from used to unused or vice versa - all done.
* For dodgy entries make sure we try to sync again (queue the entry).
*/
@@ -675,6 +678,7 @@ static void
udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
{
const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ struct udp_tunnel_nic_shared_node *node;
unsigned int i, j;
/* Freeze all the ports we are already tracking so that the replay
@@ -686,7 +690,12 @@ udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
utn->missed = 0;
utn->need_replay = 0;
- udp_tunnel_get_rx_info(dev);
+ if (!info->shared) {
+ udp_tunnel_get_rx_info(dev);
+ } else {
+ list_for_each_entry(node, &info->shared->devices, list)
+ udp_tunnel_get_rx_info(node->dev);
+ }
for (i = 0; i < utn->n_tables; i++)
for (j = 0; j < info->tables[i].n_entries; j++)
@@ -742,20 +751,39 @@ err_free_utn:
return NULL;
}
+static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn)
+{
+ unsigned int i;
+
+ for (i = 0; i < utn->n_tables; i++)
+ kfree(utn->entries[i]);
+ kfree(utn->entries);
+ kfree(utn);
+}
+
static int udp_tunnel_nic_register(struct net_device *dev)
{
const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ struct udp_tunnel_nic_shared_node *node = NULL;
struct udp_tunnel_nic *utn;
unsigned int n_tables, i;
BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE <
UDP_TUNNEL_NIC_MAX_TABLES);
+ /* Expect use count of at most 2 (IPv4, IPv6) per device */
+ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX <
+ UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2);
+ /* Check that the driver info is sane */
if (WARN_ON(!info->set_port != !info->unset_port) ||
WARN_ON(!info->set_port == !info->sync_table) ||
WARN_ON(!info->tables[0].n_entries))
return -EINVAL;
+ if (WARN_ON(info->shared &&
+ info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+ return -EINVAL;
+
n_tables = 1;
for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) {
if (!info->tables[i].n_entries)
@@ -766,9 +794,33 @@ static int udp_tunnel_nic_register(struct net_device *dev)
return -EINVAL;
}
- utn = udp_tunnel_nic_alloc(info, n_tables);
- if (!utn)
- return -ENOMEM;
+ /* Create UDP tunnel state structures */
+ if (info->shared) {
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ node->dev = dev;
+ }
+
+ if (info->shared && info->shared->udp_tunnel_nic_info) {
+ utn = info->shared->udp_tunnel_nic_info;
+ } else {
+ utn = udp_tunnel_nic_alloc(info, n_tables);
+ if (!utn) {
+ kfree(node);
+ return -ENOMEM;
+ }
+ }
+
+ if (info->shared) {
+ if (!info->shared->udp_tunnel_nic_info) {
+ INIT_LIST_HEAD(&info->shared->devices);
+ info->shared->udp_tunnel_nic_info = utn;
+ }
+
+ list_add_tail(&node->list, &info->shared->devices);
+ }
utn->dev = dev;
dev_hold(dev);
@@ -783,7 +835,33 @@ static int udp_tunnel_nic_register(struct net_device *dev)
static void
udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
{
- unsigned int i;
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+
+ /* For a shared table remove this dev from the list of sharing devices
+ * and if there are other devices just detach.
+ */
+ if (info->shared) {
+ struct udp_tunnel_nic_shared_node *node, *first;
+
+ list_for_each_entry(node, &info->shared->devices, list)
+ if (node->dev == dev)
+ break;
+ if (node->dev != dev)
+ return;
+
+ list_del(&node->list);
+ kfree(node);
+
+ first = list_first_entry_or_null(&info->shared->devices,
+ typeof(*first), list);
+ if (first) {
+ udp_tunnel_drop_rx_info(dev);
+ utn->dev = first->dev;
+ goto release_dev;
+ }
+
+ info->shared->udp_tunnel_nic_info = NULL;
+ }
/* Flush before we check work, so we don't waste time adding entries
* from the work which we will boot immediately.
@@ -796,10 +874,8 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
if (utn->work_pending)
return;
- for (i = 0; i < utn->n_tables; i++)
- kfree(utn->entries[i]);
- kfree(utn->entries);
- kfree(utn);
+ udp_tunnel_nic_free(utn);
+release_dev:
dev->udp_tunnel_nic = NULL;
dev_put(dev);
}
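The udp_tunnel_nic changes let several devices attach to one table through info->shared: the first registrant allocates the state, later ones only add a node to the devices list, and the state is freed only when the last node goes away. A simplified userspace sketch of that ownership pattern (assumed types, not the kernel structures):

#include <stdlib.h>

struct node { void *dev; struct node *next; };

struct shared {
    void *state;            /* allocated by the first registrant */
    struct node *devices;   /* registrants currently attached */
};

/* Attach dev; allocate the shared state only for the first device. */
static int shared_register(struct shared *sh, void *dev, size_t state_size)
{
    struct node *n = calloc(1, sizeof(*n));

    if (!n)
        return -1;
    if (!sh->state) {
        sh->state = calloc(1, state_size);
        if (!sh->state) {
            free(n);
            return -1;
        }
    }
    n->dev = dev;
    n->next = sh->devices;
    sh->devices = n;
    return 0;
}

/* Detach dev; free the shared state only when no device remains. */
static void shared_unregister(struct shared *sh, void *dev)
{
    struct node **pp = &sh->devices;
    struct node *n;

    while (*pp && (*pp)->dev != dev)
        pp = &(*pp)->next;
    if (!*pp)
        return;
    n = *pp;
    *pp = n->next;
    free(n);
    if (!sh->devices) {
        free(sh->state);
        sh->state = NULL;
    }
}

int main(void)
{
    struct shared sh = { 0 };
    int a, b;

    shared_register(&sh, &a, 64);
    shared_register(&sh, &b, 64);
    shared_unregister(&sh, &a);   /* state kept, b still attached */
    shared_unregister(&sh, &b);   /* last user gone, state freed */
    return 0;
}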
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index dc19aff7c2e0..fb0648e7fb32 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -64,14 +64,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
- .priority = 3,
+ .priority = 4,
};
#if IS_ENABLED(CONFIG_IPV6)
static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
#endif
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 01146b66d666..9edc5bb2d531 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1997,6 +1997,7 @@ EXPORT_SYMBOL(ipv6_chk_prefix);
* ipv6_dev_find - find the first device with a given source address.
* @net: the net namespace
* @addr: the source address
+ * @dev: used to find the L3 domain of interest
*
* The caller should be protected by RCU, or RTNL.
*/
@@ -2466,8 +2467,9 @@ static void addrconf_add_mroute(struct net_device *dev)
.fc_ifindex = dev->ifindex,
.fc_dst_len = 8,
.fc_flags = RTF_UP,
- .fc_type = RTN_UNICAST,
+ .fc_type = RTN_MULTICAST,
.fc_nlinfo.nl_net = dev_net(dev),
+ .fc_protocol = RTPROT_KERNEL,
};
ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
@@ -5022,8 +5024,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
return -EMSGSIZE;
if (args->netnsid >= 0 &&
- nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) {
+ nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
+ }
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 ||
@@ -5054,8 +5058,10 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
return -EMSGSIZE;
if (args->netnsid >= 0 &&
- nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) {
+ nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
+ }
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 ||
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 9ebf3fe0d2b1..c70c192bc91b 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -191,6 +191,13 @@ static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt,
return -EAFNOSUPPORT;
}
+static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ kfree_skb(skb);
+ return -EAFNOSUPPORT;
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow,
.ipv6_route_input = eafnosupport_ipv6_route_input,
@@ -201,6 +208,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
.fib6_nh_init = eafnosupport_fib6_nh_init,
.ip6_del_rt = eafnosupport_ip6_del_rt,
+ .ipv6_fragment = eafnosupport_ipv6_fragment,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 642fc6ac13d2..8a22486cf270 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -306,7 +306,9 @@ static int ip6addrlbl_del(struct net *net,
/* add default label */
static int __net_init ip6addrlbl_net_init(struct net *net)
{
- int err = 0;
+ struct ip6addrlbl_entry *p = NULL;
+ struct hlist_node *n;
+ int err;
int i;
ADDRLABEL(KERN_DEBUG "%s\n", __func__);
@@ -315,14 +317,20 @@ static int __net_init ip6addrlbl_net_init(struct net *net)
INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head);
for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) {
- int ret = ip6addrlbl_add(net,
- ip6addrlbl_init_table[i].prefix,
- ip6addrlbl_init_table[i].prefixlen,
- 0,
- ip6addrlbl_init_table[i].label, 0);
- /* XXX: should we free all rules when we catch an error? */
- if (ret && (!err || err != -ENOMEM))
- err = ret;
+ err = ip6addrlbl_add(net,
+ ip6addrlbl_init_table[i].prefix,
+ ip6addrlbl_init_table[i].prefixlen,
+ 0,
+ ip6addrlbl_init_table[i].label, 0);
+ if (err)
+ goto err_ip6addrlbl_add;
+ }
+ return 0;
+
+err_ip6addrlbl_add:
+ hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
+ hlist_del_rcu(&p->list);
+ kfree_rcu(p, rcu);
}
return err;
}
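The addrlabel init path above now unwinds everything it added when one default label fails to install, instead of returning a partially populated table. A simplified userspace sketch of that unwind-on-partial-failure pattern (illustrative only, not the kernel data structures):

#include <stdlib.h>

struct entry { struct entry *next; };

/* Add n entries to a list; if any allocation fails, undo everything added
 * so far and return the error instead of leaving a half-built table. */
static int build_list(struct entry **head, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        struct entry *e = calloc(1, sizeof(*e));

        if (!e)
            goto err_unwind;
        e->next = *head;
        *head = e;
    }
    return 0;

err_unwind:
    while (*head) {
        struct entry *e = *head;

        *head = e->next;
        free(e);
    }
    return -1;
}

int main(void)
{
    struct entry *head = NULL;

    return build_list(&head, 4) ? 1 : 0;
}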
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 0306509ab063..8e9c3e9ea36e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
- err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
+ err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr);
if (err)
return err;
@@ -661,6 +661,7 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
const struct proto_ops inet6_stream_ops = {
.family = PF_INET6,
+ .flags = PROTO_CMSG_DATA_ONLY,
.owner = THIS_MODULE,
.release = inet6_release,
.bind = inet6_bind,
@@ -818,7 +819,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
fl6.flowi6_uid = sk->sk_uid;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
rcu_read_lock();
final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt),
@@ -1026,6 +1027,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
.xfrm6_rcv_encap = xfrm6_rcv_encap,
#endif
.nd_tbl = &nd_tbl,
+ .ipv6_fragment = ip6_fragment,
};
static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index d88d97617f7e..440080da805b 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -588,7 +588,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
memset(ah->auth_data, 0, ahp->icv_trunc_len);
- if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN))
+ err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN);
+ if (err)
goto out_free;
ip6h->priority = 0;
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 8d3f66c310db..51184a70ac7e 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -423,7 +423,7 @@ static void calipso_doi_free_rcu(struct rcu_head *entry)
/**
* calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
* @doi: the DOI value
- * @audit_secid: the LSM secid to use in the audit message
+ * @audit_info: NetLabel audit information
*
* Description:
* Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
@@ -761,7 +761,7 @@ static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len,
calipso[1] = len - 2;
*(__be32 *)(calipso + 2) = htonl(doi_def->doi);
calipso[6] = (len - CALIPSO_HDR_LEN) / 4;
- calipso[7] = secattr->attr.mls.lvl,
+ calipso[7] = secattr->attr.mls.lvl;
crc = ~crc_ccitt(0xffff, calipso, len);
calipso[8] = crc & 0xff;
calipso[9] = (crc >> 8) & 0xff;
@@ -1226,7 +1226,7 @@ static int calipso_req_setattr(struct request_sock *req,
/**
* calipso_req_delattr - Delete the CALIPSO option from a request socket
- * @reg: the request socket
+ * @req: the request socket
*
* Description:
* Removes the CALIPSO option from a request socket, if present.
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index cc8ad7ddecda..206f66310a88 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -60,7 +60,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr))
fl6->flowi6_oif = np->mcast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 52c2f063529f..2b804fcebcc6 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -478,7 +478,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb,
int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
u8 *tail;
- u8 *vaddr;
int nfrags;
int esph_offset;
struct page *page;
@@ -519,14 +518,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
page = pfrag->page;
get_page(page);
- vaddr = kmap_atomic(page);
-
- tail = vaddr + pfrag->offset;
+ tail = page_address(page) + pfrag->offset;
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
- kunmap_atomic(vaddr);
-
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 374105e4394f..6126f8bf94b3 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -906,11 +906,6 @@ void ipv6_exthdrs_exit(void)
/*
* Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input().
*/
-static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb)
-{
- return skb_dst(skb) ? ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev);
-}
-
static inline struct net *ipv6_skb_net(struct sk_buff *skb)
{
return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index a4e4912ad607..f3d05866692e 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -158,7 +158,13 @@ static bool is_ineligible(const struct sk_buff *skb)
tp = skb_header_pointer(skb,
ptr+offsetof(struct icmp6hdr, icmp6_type),
sizeof(_type), &_type);
- if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
+
+ /* Based on RFC 8200, Section 4.5 Fragment Header, return
+ * false if this is a fragment packet with no icmp header info.
+ */
+ if (!tp && frag_off != 0)
+ return false;
+ else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
return true;
}
return false;
@@ -314,10 +320,10 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st
{
struct icmpv6_msg *msg = (struct icmpv6_msg *) from;
struct sk_buff *org_skb = msg->skb;
- __wsum csum = 0;
+ __wsum csum;
csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset,
- to, len, csum);
+ to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
if (!(msg->type & ICMPV6_INFOMSG_MASK))
nf_ct_attach(skb, org_skb);
@@ -501,8 +507,11 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if (__ipv6_addr_needs_scope_id(addr_type)) {
iif = icmp6_iif(skb);
} else {
- dst = skb_dst(skb);
- iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
+ /*
+ * The source device is used for looking up which routing table
+ * to use for sending an ICMP error.
+ */
+ iif = l3mdev_master_ifindex(skb->dev);
}
/*
@@ -564,7 +573,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.fl6_icmp_code = code;
fl6.flowi6_uid = sock_net_uid(net, NULL);
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
np = inet6_sk(sk);
@@ -746,7 +755,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
fl6.flowi6_uid = sock_net_uid(net, NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
local_bh_disable();
sk = icmpv6_xmit_lock(net);
@@ -999,7 +1008,7 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
fl6->fl6_icmp_type = type;
fl6->fl6_icmp_code = 0;
fl6->flowi6_oif = oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}
static void __net_exit icmpv6_sk_exit(struct net *net)
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index e315526fa244..5a9f4d722f35 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -46,7 +46,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->fl6_dport = ireq->ir_rmt_port;
fl6->fl6_sport = htons(ireq->ir_num);
fl6->flowi6_uid = sk->sk_uid;
- security_req_classify_flow(req, flowi6_to_flowi(fl6));
+ security_req_classify_flow(req, flowi6_to_flowi_common(fl6));
dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (IS_ERR(dst))
@@ -95,7 +95,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->fl6_sport = inet->inet_sport;
fl6->fl6_dport = inet->inet_dport;
fl6->flowi6_uid = sk->sk_uid;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
rcu_read_lock();
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 2d3add9e6116..55c290d55605 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -94,7 +94,7 @@ EXPORT_SYMBOL(__inet6_lookup_established);
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum,
const struct in6_addr *daddr,
- const int dif, const int sdif, bool exact_dif)
+ const int dif, const int sdif)
{
int score = -1;
@@ -138,15 +138,13 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif, const int sdif)
{
- bool exact_dif = inet6_exact_dif_match(net, skb);
struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
int score, hiscore = 0;
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
sk = (struct sock *)icsk;
- score = compute_score(sk, net, hnum, daddr, dif, sdif,
- exact_dif);
+ score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
result = lookup_reuseport(net, sk, skb, doff,
saddr, sport, daddr, hnum);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4a664ad4f4d4..f43e27555725 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1025,6 +1025,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
{
struct fib6_table *table = rt->fib6_table;
+ /* Flush all cached dst in exception table */
+ rt6_flush_exceptions(rt);
fib6_drop_pcpu_from(rt, table);
if (rt->nh && !list_empty(&rt->nh_list))
@@ -1812,10 +1814,14 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
children = 0;
child = NULL;
- if (fn_r)
- child = fn_r, children |= 1;
- if (fn_l)
- child = fn_l, children |= 2;
+ if (fn_r) {
+ child = fn_r;
+ children |= 1;
+ }
+ if (fn_l) {
+ child = fn_l;
+ children |= 2;
+ }
if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
@@ -1923,9 +1929,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
- /* Flush all cached dst in exception table */
- rt6_flush_exceptions(rt);
-
/* Reset round-robin state, if necessary */
if (rcu_access_pointer(fn->rr_ptr) == rt)
fn->rr_ptr = NULL;
@@ -2618,8 +2621,10 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
iter->skip = *pos;
if (iter->tbl) {
+ loff_t p = 0;
+
ipv6_route_seq_setup_walk(iter, net);
- return ipv6_route_seq_next(seq, NULL, pos);
+ return ipv6_route_seq_next(seq, NULL, &p);
} else {
return NULL;
}
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 3a57fb9ce049..c3bc89b6b1a1 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -707,6 +707,17 @@ static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb,
return 0;
}
+static struct ip_tunnel_info *skb_tunnel_info_txcheck(struct sk_buff *skb)
+{
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
+ return ERR_PTR(-EINVAL);
+
+ return tun_info;
+}
+
static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
struct net_device *dev, __u8 dsfield,
struct flowi6 *fl6, int encap_limit,
@@ -734,10 +745,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
const struct ip_tunnel_key *key;
__be16 flags;
- tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info ||
- !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET6))
+ tun_info = skb_tunnel_info_txcheck(skb);
+ if (IS_ERR(tun_info) ||
+ unlikely(ip_tunnel_info_af(tun_info) != AF_INET6))
return -EINVAL;
key = &tun_info->key;
@@ -908,7 +918,8 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_err:
- stats->tx_errors++;
+ if (!t->parms.collect_md || !IS_ERR(skb_tunnel_info_txcheck(skb)))
+ stats->tx_errors++;
stats->tx_dropped++;
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -917,6 +928,7 @@ tx_err:
static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
struct net_device *dev)
{
+ struct ip_tunnel_info *tun_info = NULL;
struct ip6_tnl *t = netdev_priv(dev);
struct dst_entry *dst = skb_dst(skb);
struct net_device_stats *stats;
@@ -964,15 +976,13 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
* for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}.
*/
if (t->parms.collect_md) {
- struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
struct erspan_metadata *md;
__be32 tun_id;
- tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info ||
- !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET6))
+ tun_info = skb_tunnel_info_txcheck(skb);
+ if (IS_ERR(tun_info) ||
+ unlikely(ip_tunnel_info_af(tun_info) != AF_INET6))
goto tx_err;
key = &tun_info->key;
@@ -1065,7 +1075,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
tx_err:
stats = &t->dev->stats;
- stats->tx_errors++;
+ if (!IS_ERR(tun_info))
+ stats->tx_errors++;
stats->tx_dropped++;
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -1122,8 +1133,13 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu,
return;
if (rt->dst.dev) {
- dev->needed_headroom = rt->dst.dev->hard_header_len +
- t_hlen;
+ unsigned short dst_len = rt->dst.dev->hard_header_len +
+ t_hlen;
+
+ if (t->dev->header_ops)
+ dev->hard_header_len = dst_len;
+ else
+ dev->needed_headroom = dst_len;
if (set_mtu) {
dev->mtu = rt->dst.dev->mtu - t_hlen;
@@ -1148,7 +1164,12 @@ static int ip6gre_calc_hlen(struct ip6_tnl *tunnel)
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
- tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
+
+ if (tunnel->dev->header_ops)
+ tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ else
+ tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
+
return t_hlen;
}
@@ -1380,7 +1401,7 @@ static const struct net_device_ops ip6gre_netdev_ops = {
.ndo_start_xmit = ip6gre_tunnel_xmit,
.ndo_do_ioctl = ip6gre_tunnel_ioctl,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1817,7 +1838,7 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1885,7 +1906,7 @@ static const struct net_device_ops ip6erspan_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c78e67d7747f..077d43af8226 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -125,8 +125,43 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
return -EINVAL;
}
+static int
+ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
+{
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features;
+ int ret = 0;
+
+ /* Please see corresponding comment in ip_finish_output_gso
+ * describing the cases where GSO segment length exceeds the
+ * egress MTU.
+ */
+ features = netif_skb_features(skb);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ consume_skb(skb);
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ int err;
+
+ skb_mark_not_on_list(segs);
+ err = ip6_fragment(net, sk, segs, ip6_finish_output2);
+ if (err && ret == 0)
+ ret = err;
+ }
+
+ return ret;
+}
+
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ unsigned int mtu;
+
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
@@ -135,7 +170,11 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff
}
#endif
- if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
+ mtu = ip6_skb_dst_mtu(skb);
+ if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
+ return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
+
+ if ((skb->len > mtu && !skb_is_gso(skb)) ||
dst_allfrag(skb_dst(skb)) ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
return ip6_fragment(net, sk, skb, ip6_finish_output2);
@@ -468,8 +507,6 @@ int ip6_forward(struct sk_buff *skb)
* check and decrement ttl
*/
if (hdr->hop_limit <= 1) {
- /* Force OUTPUT device used as source address */
- skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
@@ -1492,7 +1529,7 @@ emsgsize:
* Otherwise, we need to reserve fragment header and
* fragment alignment (= 8-15 octects, in total).
*
- * Note that we may need to "move" the data from the tail of
+ * Note that we may need to "move" the data from the tail
* of the buffer to the new fragment when we split
* the message.
*
@@ -1615,7 +1652,7 @@ alloc_new_skb:
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
- data + transhdrlen, fraggap, 0);
+ data + transhdrlen, fraggap);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a0217e5bf3bc..a7950baa05e5 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -94,36 +94,6 @@ static inline int ip6_tnl_mpls_supported(void)
return IS_ENABLED(CONFIG_MPLS);
}
-static struct net_device_stats *ip6_get_stats(struct net_device *dev)
-{
- struct pcpu_sw_netstats tmp, sum = { 0 };
- int i;
-
- for_each_possible_cpu(i) {
- unsigned int start;
- const struct pcpu_sw_netstats *tstats =
- per_cpu_ptr(dev->tstats, i);
-
- do {
- start = u64_stats_fetch_begin_irq(&tstats->syncp);
- tmp.rx_packets = tstats->rx_packets;
- tmp.rx_bytes = tstats->rx_bytes;
- tmp.tx_packets = tstats->tx_packets;
- tmp.tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
-
- sum.rx_packets += tmp.rx_packets;
- sum.rx_bytes += tmp.rx_bytes;
- sum.tx_packets += tmp.tx_packets;
- sum.tx_bytes += tmp.tx_bytes;
- }
- dev->stats.rx_packets = sum.rx_packets;
- dev->stats.rx_bytes = sum.rx_bytes;
- dev->stats.tx_packets = sum.tx_packets;
- dev->stats.tx_bytes = sum.tx_bytes;
- return &dev->stats;
-}
-
#define for_each_ip6_tunnel_rcu(start) \
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
@@ -204,6 +174,7 @@ ip6_tnl_lookup(struct net *net, int link,
/**
* ip6_tnl_bucket - get head of list matching given tunnel parameters
+ * @ip6n: the private data for ip6_vti in the netns
* @p: parameters containing tunnel end-points
*
* Description:
@@ -230,6 +201,7 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p)
/**
* ip6_tnl_link - add tunnel to hash table
+ * @ip6n: the private data for ip6_vti in the netns
* @t: tunnel to be added
**/
@@ -246,6 +218,7 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
/**
* ip6_tnl_unlink - remove tunnel from hash table
+ * @ip6n: the private data for ip6_vti in the netns
* @t: tunnel to be removed
**/
@@ -417,6 +390,7 @@ ip6_tnl_dev_uninit(struct net_device *dev)
/**
* parse_tvl_tnl_enc_lim - handle encapsulation limit option
* @skb: received socket buffer
+ * @raw: the ICMPv6 error message data
*
* Return:
* 0 if none was found,
@@ -485,14 +459,9 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
}
EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim);
-/**
- * ip6_tnl_err - tunnel error handler
- *
- * Description:
- * ip6_tnl_err() should handle errors in the tunnel according
- * to the specifications in RFC 2473.
- **/
-
+/* ip6_tnl_err() should handle errors in the tunnel according to the
+ * specifications in RFC 2473.
+ */
static int
ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
u8 *type, u8 *code, int *msg, __u32 *info, int offset)
@@ -1271,6 +1240,8 @@ route_lookup:
if (max_headroom > dev->needed_headroom)
dev->needed_headroom = max_headroom;
+ skb_set_inner_ipproto(skb, proto);
+
err = ip6_tnl_encap(skb, t, &proto, fl6);
if (err)
return err;
@@ -1280,8 +1251,6 @@ route_lookup:
ipv6_push_frag_opts(skb, &opt.ops, &proto);
}
- skb_set_inner_ipproto(skb, proto);
-
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
@@ -1835,7 +1804,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_start_xmit = ip6_tnl_start_xmit,
.ndo_do_ioctl = ip6_tnl_ioctl,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats = ip6_get_stats,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index fac01b80a104..0225fd694192 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -125,6 +125,7 @@ vti6_tnl_lookup(struct net *net, const struct in6_addr *remote,
/**
* vti6_tnl_bucket - get head of list matching given tunnel parameters
+ * @ip6n: the private data for ip6_vti in the netns
* @p: parameters containing tunnel end-points
*
* Description:
@@ -347,7 +348,6 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
{
unsigned short family;
struct net_device *dev;
- struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
const struct xfrm_mode *inner_mode;
struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
@@ -390,12 +390,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev)));
skb->dev = dev;
-
- tstats = this_cpu_ptr(dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(dev, skb->len);
return 0;
}
@@ -895,7 +890,7 @@ static const struct net_device_ops vti6_netdev_ops = {
.ndo_uninit = vti6_dev_uninit,
.ndo_start_xmit = vti6_tnl_xmit,
.ndo_do_ioctl = vti6_ioctl,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 43a894bf9a1b..a6804a7e34c1 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -1148,7 +1148,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (sk->sk_type != SOCK_STREAM)
return -ENOPROTOOPT;
- msg.msg_control = optval;
+ msg.msg_control_user = optval;
msg.msg_controllen = len;
msg.msg_flags = flags;
msg.msg_control_is_user = true;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 8cd2782a31e4..6c8604390266 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -548,7 +548,7 @@ done:
}
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
- struct sockaddr_storage *p)
+ struct sockaddr_storage __user *p)
{
int err, i, count, copycount;
const struct in6_addr *group;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 27f29b957ee7..76717478f173 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -81,6 +81,7 @@ static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
static int pndisc_constructor(struct pneigh_entry *n);
static void pndisc_destructor(struct pneigh_entry *n);
static void pndisc_redo(struct sk_buff *skb);
+static int ndisc_is_multicast(const void *pkey);
static const struct neigh_ops ndisc_generic_ops = {
.family = AF_INET6,
@@ -115,6 +116,7 @@ struct neigh_table nd_tbl = {
.pconstructor = pndisc_constructor,
.pdestructor = pndisc_destructor,
.proxy_redo = pndisc_redo,
+ .is_multicast = ndisc_is_multicast,
.allow_add = ndisc_allow_add,
.id = "ndisc_cache",
.parms = {
@@ -1706,6 +1708,11 @@ static void pndisc_redo(struct sk_buff *skb)
kfree_skb(skb);
}
+static int ndisc_is_multicast(const void *pkey)
+{
+ return ipv6_addr_is_multicast((struct in6_addr *)pkey);
+}
+
static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
{
struct inet6_dev *idev = __in6_dev_get(skb->dev);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 6d0e942d082d..ab9a279dd6d4 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -20,10 +20,10 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include "../bridge/br_private.h"
-int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
+int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct sock *sk = sk_to_full_sk(skb->sk);
+ struct sock *sk = sk_to_full_sk(sk_partial);
unsigned int hh_len;
struct dst_entry *dst;
int strict = (ipv6_addr_type(&iph->daddr) &
@@ -84,7 +84,7 @@ static int nf_ip6_reroute(struct sk_buff *skb,
if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
!ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
skb->mark != rt_info->mark)
- return ip6_route_me_harder(entry->state.net, skb);
+ return ip6_route_me_harder(entry->state.net, entry->state.sk, skb);
}
return 0;
}
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 2e2119bfcf13..0d453fa9e327 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -280,7 +280,7 @@ ip6t_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
- private = READ_ONCE(table->private); /* Address dependency. */
+ private = rcu_access_pointer(table->private);
cpu = smp_processor_id();
table_base = private->entries;
jumpstack = (struct ip6t_entry **)private->jumpstack[cpu];
@@ -807,7 +807,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(table);
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -831,7 +831,7 @@ copy_entries_to_user(unsigned int total_size,
unsigned int off, num;
const struct ip6t_entry *e;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(table);
int ret = 0;
const void *loc_cpu_entry;
@@ -980,7 +980,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
t = xt_request_find_table_lock(net, AF_INET6, name);
if (!IS_ERR(t)) {
struct ip6t_getinfo info;
- const struct xt_table_info *private = t->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(t);
#ifdef CONFIG_COMPAT
struct xt_table_info tmp;
@@ -1035,7 +1035,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
t = xt_find_table_lock(net, AF_INET6, get.name);
if (!IS_ERR(t)) {
- struct xt_table_info *private = t->private;
+ struct xt_table_info *private = xt_table_get_private_protected(t);
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
@@ -1189,7 +1189,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
}
local_bh_disable();
- private = t->private;
+ private = xt_table_get_private_protected(t);
if (private->number != tmp.num_counters) {
ret = -EINVAL;
goto unlock_up_free;
@@ -1552,7 +1552,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
void __user *userptr)
{
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(table);
void __user *pos;
unsigned int size;
int ret = 0;
@@ -1598,7 +1598,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
xt_compat_lock(AF_INET6);
t = xt_find_table_lock(net, AF_INET6, get.name);
if (!IS_ERR(t)) {
- const struct xt_table_info *private = t->private;
+ const struct xt_table_info *private = xt_table_get_private_protected(t);
struct xt_table_info info;
ret = compat_table_info(private, &info);
if (!ret && get.size == info.size)
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index 9ee077bf4f49..787c74aa85e3 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -77,16 +77,43 @@ static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt,
return true;
}
+static struct ipv6hdr *icmpv6_bounced_ipv6hdr(struct sk_buff *skb,
+ struct ipv6hdr *_bounced_hdr)
+{
+ if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+ return NULL;
+
+ if (!icmpv6_is_err(icmp6_hdr(skb)->icmp6_type))
+ return NULL;
+
+ return skb_header_pointer(skb,
+ skb_transport_offset(skb) + sizeof(struct icmp6hdr),
+ sizeof(struct ipv6hdr),
+ _bounced_hdr);
+}
+
static unsigned int
ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) {
icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
offsetof(struct ipv6hdr, saddr));
return NF_DROP;
}
+
+ /* rewrite dst addr of bounced packet which was sent to dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->daddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->daddr);
+ }
+
return XT_CONTINUE;
}
@@ -94,12 +121,24 @@ static unsigned int
ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) {
icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
offsetof(struct ipv6hdr, daddr));
return NF_DROP;
}
+
+ /* rewrite src addr of bounced packet which was sent from dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->saddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->saddr);
+ }
+
return XT_CONTINUE;
}
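Both NPT targets above decide whether to rewrite the embedded (bounced) header by comparing address prefixes via ipv6_addr_prefix() and ipv6_addr_cmp(). A rough userspace equivalent of that prefix extraction and comparison (assumed layout, for illustration only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copy the first plen bits of src into dst and zero the rest -- the same
 * idea as the kernel's ipv6_addr_prefix(). */
static void addr_prefix(uint8_t dst[16], const uint8_t src[16], unsigned plen)
{
    unsigned bytes = plen / 8, bits = plen % 8;

    memset(dst, 0, 16);
    memcpy(dst, src, bytes);
    if (bits)
        dst[bytes] = src[bytes] & (uint8_t)(0xff << (8 - bits));
}

static bool same_prefix(const uint8_t a[16], const uint8_t b[16], unsigned plen)
{
    uint8_t pa[16], pb[16];

    addr_prefix(pa, a, plen);
    addr_prefix(pb, b, plen);
    return memcmp(pa, pb, 16) == 0;
}

int main(void)
{
    uint8_t a[16] = { 0x20, 0x01, 0x0d, 0xb8, 0x00, 0x01 }; /* 2001:db8:1::/48 */
    uint8_t b[16] = { 0x20, 0x01, 0x0d, 0xb8, 0x00, 0x02 }; /* 2001:db8:2::/48 */

    printf("%d %d\n", same_prefix(a, b, 32), same_prefix(a, b, 48)); /* 1 0 */
    return 0;
}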
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 3ac5485049f0..a35019d2e480 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -61,7 +61,7 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
/* Do nothing */
break;
case IP6T_TCP_RESET:
- nf_send_reset6(net, skb, xt_hooknum(par));
+ nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par));
break;
case IP6T_ICMP6_POLICY_FAIL:
nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par));
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 1a2748611e00..cee74803d7a1 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -57,7 +57,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
- err = ip6_route_me_harder(state->net, skb);
+ err = ip6_route_me_harder(state->net, state->sk, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index fed9666a2f7d..c129ad334eb3 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -355,6 +355,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
ipv6_hdr(skb)->payload_len = htons(payload_len);
ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
/* Yes, and fold redundant checksum back. 8) */
if (skb->ip_summed == CHECKSUM_COMPLETE)
@@ -439,6 +440,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
{
u16 savethdr = skb->transport_header;
+ u8 nexthdr = NEXTHDR_FRAGMENT;
int fhoff, nhoff, ret;
struct frag_hdr *fhdr;
struct frag_queue *fq;
@@ -454,6 +456,14 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
return 0;
+ /* Discard the first fragment if it does not include all headers
+ * RFC 8200, Section 4.5
+ */
+ if (ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) {
+ pr_debug("Drop incomplete fragment\n");
+ return 0;
+ }
+
if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr)))
return -ENOMEM;
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index da64550a5707..8210ff34ed9b 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -297,9 +297,11 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m,
switch (dev->type) {
case ARPHRD_ETHER:
- nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
- eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
- ntohs(eth_hdr(skb)->h_proto));
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+ nf_log_dump_vlan(m, skb);
+ nf_log_buf_add(m, "MACPROTO=%04x ",
+ ntohs(eth_hdr(skb)->h_proto));
return;
default:
break;
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 4aef6baaa55e..dffeaaaadcde 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -12,6 +12,140 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
+static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int thoff;
+ __be16 fo;
+ u8 proto = ip6h->nexthdr;
+
+ if (skb_csum_unnecessary(skb))
+ return true;
+
+ if (ip6h->payload_len &&
+ pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
+ return false;
+
+ ip6h = ipv6_hdr(skb);
+ thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
+ if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
+ return false;
+
+ if (!nf_reject_verify_csum(proto))
+ return true;
+
+ return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
+}
+
+static int nf_reject_ip6hdr_validate(struct sk_buff *skb)
+{
+ struct ipv6hdr *hdr;
+ u32 pkt_len;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return 0;
+
+ pkt_len = ntohs(hdr->payload_len);
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+ return 0;
+
+ return 1;
+}
+
+struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+ const struct tcphdr *oth;
+ struct tcphdr _oth;
+ unsigned int otcplen;
+ struct ipv6hdr *nip6h;
+
+ if (!nf_reject_ip6hdr_validate(oldskb))
+ return NULL;
+
+ oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook);
+ if (!oth)
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
+ net->ipv6.devconf_all->hop_limit);
+ nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset);
+
+struct sk_buff *nf_reject_skb_v6_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *nip6h;
+ struct icmp6hdr *icmp6h;
+ unsigned int len;
+
+ if (!nf_reject_ip6hdr_validate(oldskb))
+ return NULL;
+
+ /* Include "As much of invoking packet as possible without the ICMPv6
+ * packet exceeding the minimum IPv6 MTU" in the ICMP payload.
+ */
+ len = min_t(unsigned int, 1220, oldskb->len);
+
+ if (!pskb_may_pull(oldskb, len))
+ return NULL;
+
+ if (!nf_reject_v6_csum_ok(oldskb, hook))
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) +
+ LL_MAX_HEADER + len, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6,
+ net->ipv6.devconf_all->hop_limit);
+
+ skb_reset_transport_header(nskb);
+ icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr));
+ icmp6h->icmp6_type = ICMPV6_DEST_UNREACH;
+ icmp6h->icmp6_code = code;
+
+ skb_put_data(nskb, skb_network_header(oldskb), len);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ icmp6h->icmp6_cksum =
+ csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
+ nskb->len - sizeof(struct ipv6hdr),
+ IPPROTO_ICMPV6,
+ csum_partial(icmp6h,
+ nskb->len - sizeof(struct ipv6hdr),
+ 0));
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach);
+
const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
struct tcphdr *otcph,
unsigned int *otcplen, int hook)
@@ -141,7 +275,8 @@ static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in)
return 0;
}
-void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
+void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+ int hook)
{
struct net_device *br_indev __maybe_unused;
struct sk_buff *nskb;
@@ -170,7 +305,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
- if (hook == NF_INET_PRE_ROUTING) {
+ if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) {
nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
if (!dst)
return;
@@ -179,7 +314,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
- security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
dst_release(dst);
@@ -233,7 +368,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
dev_queue_xmit(nskb);
} else
#endif
- ip6_local_out(net, nskb->sk, nskb);
+ ip6_local_out(net, sk, nskb);
}
EXPORT_SYMBOL_GPL(nf_send_reset6);
@@ -268,7 +403,8 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in,
if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL)
skb_in->dev = net->loopback_dev;
- if (hooknum == NF_INET_PRE_ROUTING && nf_reject6_fill_skb_dst(skb_in))
+ if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) &&
+ nf_reject6_fill_skb_dst(skb_in) < 0)
return;
icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
index c1098a1968e1..7969d1f3018d 100644
--- a/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -28,7 +28,8 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr,
nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, pkt->skb,
+ nft_hook(pkt));
break;
default:
break;
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 6caa062f68e7..6ac88fe24a8e 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -111,7 +111,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_uid = sk->sk_uid;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
ipcm6_init_sk(&ipc6, np);
ipc6.sockc.mark = sk->sk_mark;
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index bbff3e02e302..d6306aa46bb1 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -126,6 +126,7 @@ static const struct snmp_mib snmp6_udp6_list[] = {
SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI),
+ SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS),
SNMP_MIB_SENTINEL
};
@@ -137,6 +138,7 @@ static const struct snmp_mib snmp6_udplite6_list[] = {
SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 874f01cd7aec..1f56d9aae589 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -746,7 +746,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
skb->csum = csum_block_add(
skb->csum,
csum_partial_copy_nocheck(rfv->c + offset,
- to, copy, 0),
+ to, copy),
odd);
odd = 0;
@@ -915,7 +915,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_oif = np->mcast_oif;
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
if (hdrincl)
fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 1f5d4d196dcc..47a0dc46cbdb 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -42,6 +42,8 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -322,6 +324,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
struct frag_queue *fq;
const struct ipv6hdr *hdr = ipv6_hdr(skb);
struct net *net = dev_net(skb_dst(skb)->dev);
+ u8 nexthdr;
int iif;
if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
@@ -351,6 +354,20 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
return 1;
}
+ /* RFC 8200, Section 4.5 Fragment Header:
+ * If the first fragment does not include all headers through an
+ * Upper-Layer header, then that fragment should be discarded and
+ * an ICMP Parameter Problem, Code 3, message should be sent to
+ * the source of the fragment, with the Pointer field set to zero.
+ */
+ nexthdr = hdr->nexthdr;
+ if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) {
+ __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0);
+ return -1;
+ }
+
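Editor's note: to illustrate the RFC 8200 rule enforced by the block above, here is a minimal userspace sketch (not kernel code) of the decision it makes: a first fragment whose payload ends before the full upper-layer header must be discarded. The helper name and header-size constants are hypothetical.

#include <stdbool.h>
#include <stddef.h>

#define HYP_TCP_HDR_MIN 20  /* assumed minimum TCP header length */
#define HYP_UDP_HDR_LEN 8   /* UDP header length */

static bool first_frag_truncated(unsigned char nexthdr,
                                 size_t transport_off, size_t pkt_len)
{
        size_t need;

        switch (nexthdr) {
        case 6:   /* IPPROTO_TCP */
                need = HYP_TCP_HDR_MIN;
                break;
        case 17:  /* IPPROTO_UDP */
                need = HYP_UDP_HDR_LEN;
                break;
        default:  /* other upper-layer headers are not checked here */
                return false;
        }

        return pkt_len < transport_off + need;
}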
iif = skb->dev ? skb->dev->ifindex : 0;
fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fb075d9545b9..188e114b29b4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2745,7 +2745,8 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (confirm_neigh)
dst_confirm_neigh(dst, daddr);
- mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+ if (mtu < IPV6_MIN_MTU)
+ return;
if (mtu >= dst_mtu(dst))
return;
@@ -5284,9 +5285,10 @@ static int ip6_route_multipath_del(struct fib6_config *cfg,
{
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
+ int last_err = 0;
int remaining;
int attrlen;
- int err = 1, last_err = 0;
+ int err;
remaining = cfg->fc_mp_len;
rtnh = (struct rtnexthop *)cfg->fc_mp;
@@ -5556,6 +5558,10 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
goto nla_put_failure;
+
+ if (dst->lwtstate &&
+ lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
} else if (rt->fib6_nsiblings) {
struct fib6_info *sibling, *next_sibling;
struct nlattr *mp;
@@ -6037,11 +6043,6 @@ void fib6_rt_update(struct net *net, struct fib6_info *rt,
struct sk_buff *skb;
int err = -ENOBUFS;
- /* call_fib6_entry_notifiers will be removed when in-kernel notifier
- * is implemented and supported for nexthop objects
- */
- call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
-
skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
if (!skb)
goto errout;
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
index 307f336b5353..488aec9e1a74 100644
--- a/net/ipv6/rpl.c
+++ b/net/ipv6/rpl.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
* Authors:
* (C) 2020 Alexander Aring <alex.aring@gmail.com>
*/
diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c
index 5fdf3ebb953f..ff691d9f4a04 100644
--- a/net/ipv6/rpl_iptunnel.c
+++ b/net/ipv6/rpl_iptunnel.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
* Authors:
* (C) 2020 Alexander Aring <alex.aring@gmail.com>
*/
@@ -190,18 +190,13 @@ static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt)
{
struct dst_entry *dst = skb_dst(skb);
struct rpl_iptunnel_encap *tinfo;
- int err = 0;
if (skb->protocol != htons(ETH_P_IPV6))
return -EINVAL;
tinfo = rpl_encap_lwtunnel(dst->lwtstate);
- err = rpl_do_srh_inline(skb, rlwt, tinfo->srh);
- if (err)
- return err;
-
- return 0;
+ return rpl_do_srh_inline(skb, rlwt, tinfo->srh);
}
static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb)
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 85dddfe3a2c6..687d95dce085 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -35,7 +35,6 @@
#include <net/xfrm.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
#include <net/seg6.h>
#include <net/genetlink.h>
#include <net/seg6_hmac.h>
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index eba23279912d..b07f7c1c82a4 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -33,11 +33,35 @@
struct seg6_local_lwt;
+/* callbacks used for customizing the creation and destruction of a behavior */
+struct seg6_local_lwtunnel_ops {
+ int (*build_state)(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack);
+ void (*destroy_state)(struct seg6_local_lwt *slwt);
+};
+
struct seg6_action_desc {
int action;
unsigned long attrs;
+
+	/* The optattrs field specifies all the optional attributes
+	 * supported by a specific behavior.
+	 * If one of these attributes is not provided in the netlink
+	 * message during behavior creation, no error is returned to
+	 * userspace.
+	 *
+	 * Each attribute can be of only one of two types (mutually
+	 * exclusive): 1) required or 2) optional.
+	 * Every user MUST obey this rule! If you set an attribute as
+	 * required, the same attribute CANNOT be set as optional and
+	 * vice versa.
+	 */
+ unsigned long optattrs;
+
int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
int static_headroom;
+
+ struct seg6_local_lwtunnel_ops slwt_ops;
};
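Editor's note: the attrs/optattrs masks above encode attribute IDs as single-bit flags. A minimal sketch of that convention, assuming the usual (1 << ID) encoding; the IDs and mask names below are hypothetical, not the kernel's.

#include <stdbool.h>

/* hypothetical attribute IDs, for illustration only */
enum { HYP_ATTR_SRH = 1, HYP_ATTR_TABLE = 2, HYP_ATTR_VRFTABLE = 9 };

static inline bool hyp_attr_is_set(unsigned long mask, int attr_id)
{
        return mask & (1UL << attr_id);
}

/* e.g. a behavior that requires an SRH and optionally accepts a VRF table */
static const unsigned long hyp_required = 1UL << HYP_ATTR_SRH;
static const unsigned long hyp_optional = 1UL << HYP_ATTR_VRFTABLE;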
struct bpf_lwt_prog {
@@ -45,6 +69,28 @@ struct bpf_lwt_prog {
char *name;
};
+enum seg6_end_dt_mode {
+ DT_INVALID_MODE = -EINVAL,
+ DT_LEGACY_MODE = 0,
+ DT_VRF_MODE = 1,
+};
+
+struct seg6_end_dt_info {
+ enum seg6_end_dt_mode mode;
+
+ struct net *net;
+ /* VRF device associated to the routing table used by the SRv6
+ * End.DT4/DT6 behavior for routing IPv4/IPv6 packets.
+ */
+ int vrf_ifindex;
+ int vrf_table;
+
+ /* tunneled packet proto and family (IPv4 or IPv6) */
+ __be16 proto;
+ u16 family;
+ int hdrlen;
+};
+
struct seg6_local_lwt {
int action;
struct ipv6_sr_hdr *srh;
@@ -54,9 +100,16 @@ struct seg6_local_lwt {
int iif;
int oif;
struct bpf_lwt_prog bpf;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ struct seg6_end_dt_info dt_info;
+#endif
int headroom;
struct seg6_action_desc *desc;
+	/* unlike the required attrs, we have to track the optional attributes
+	 * that have actually been parsed.
+	 */
+ unsigned long parsed_optattrs;
};
static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
@@ -401,6 +454,248 @@ drop:
return -EINVAL;
}
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static struct net *fib6_config_get_net(const struct fib6_config *fib6_cfg)
+{
+ const struct nl_info *nli = &fib6_cfg->fc_nlinfo;
+
+ return nli->nl_net;
+}
+
+static int __seg6_end_dt_vrf_build(struct seg6_local_lwt *slwt, const void *cfg,
+ u16 family, struct netlink_ext_ack *extack)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+ int vrf_ifindex;
+ struct net *net;
+
+ net = fib6_config_get_net(cfg);
+
+ /* note that vrf_table was already set by parse_nla_vrftable() */
+ vrf_ifindex = l3mdev_ifindex_lookup_by_table_id(L3MDEV_TYPE_VRF, net,
+ info->vrf_table);
+ if (vrf_ifindex < 0) {
+ if (vrf_ifindex == -EPERM) {
+ NL_SET_ERR_MSG(extack,
+ "Strict mode for VRF is disabled");
+ } else if (vrf_ifindex == -ENODEV) {
+ NL_SET_ERR_MSG(extack,
+ "Table has no associated VRF device");
+ } else {
+ pr_debug("seg6local: SRv6 End.DT* creation error=%d\n",
+ vrf_ifindex);
+ }
+
+ return vrf_ifindex;
+ }
+
+ info->net = net;
+ info->vrf_ifindex = vrf_ifindex;
+
+ switch (family) {
+ case AF_INET:
+ info->proto = htons(ETH_P_IP);
+ info->hdrlen = sizeof(struct iphdr);
+ break;
+ case AF_INET6:
+ info->proto = htons(ETH_P_IPV6);
+ info->hdrlen = sizeof(struct ipv6hdr);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ info->family = family;
+ info->mode = DT_VRF_MODE;
+
+ return 0;
+}
+
+/* The SRv6 End.DT4/DT6 behavior extracts the inner (IPv4/IPv6) packet and
+ * routes the IPv4/IPv6 packet by looking at the configured routing table.
+ *
+ * In the SRv6 End.DT4/DT6 use case, we can receive traffic (IPv6+Segment
+ * Routing Header packets) from several interfaces and the outer IPv6
+ * destination address (DA) is used for retrieving the specific instance of the
+ * End.DT4/DT6 behavior that should process the packets.
+ *
+ * However, the inner IPv4/IPv6 packet is not really bound to any receiving
+ * interface and thus the End.DT4/DT6 sets the VRF (associated with the
+ * corresponding routing table) as the *receiving* interface.
+ * In other words, the End.DT4/DT6 processes a packet as if it had been received
+ * directly by the VRF (and not by one of its slave devices, if any).
+ * In this way, the VRF interface is used for routing the IPv4/IPv6 packet
+ * according to the routing table configured by the End.DT4/DT6 instance.
+ *
+ * This design provides some useful features, such as:
+ * 1) statistics on rx packets;
+ * 2) the possibility to install a packet sniffer on the receiving interface
+ * (the VRF one) to inspect incoming packets;
+ * 3) the possibility to leverage the netfilter prerouting hook for the inner
+ * IPv4 packet.
+ *
+ * This function returns:
+ * - the sk_buff* when the VRF rcv handler has processed the packet correctly;
+ * - NULL when the skb is consumed by the VRF rcv handler;
+ * - a pointer which encodes a negative error number in case of error.
+ * Note that in this case, the function takes care of freeing the skb.
+ */
+static struct sk_buff *end_dt_vrf_rcv(struct sk_buff *skb, u16 family,
+ struct net_device *dev)
+{
+ /* based on l3mdev_ip_rcv; we are only interested in the master */
+ if (unlikely(!netif_is_l3_master(dev) && !netif_has_l3_rx_handler(dev)))
+ goto drop;
+
+ if (unlikely(!dev->l3mdev_ops->l3mdev_l3_rcv))
+ goto drop;
+
+ /* the decapsulated IPv4/IPv6 packet does not carry any MAC header info.
+ * We must unset the mac header to allow the VRF device to rebuild it,
+ * just in case there is a sniffer attached to the device.
+ */
+ skb_unset_mac_header(skb);
+
+ skb = dev->l3mdev_ops->l3mdev_l3_rcv(dev, skb, family);
+ if (!skb)
+ /* the skb buffer was consumed by the handler */
+ return NULL;
+
+ /* when a packet is received by a VRF or by one of its slaves, the
+ * master device reference is set in the skb.
+ */
+ if (unlikely(skb->dev != dev || skb->skb_iif != dev->ifindex))
+ goto drop;
+
+ return skb;
+
+drop:
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct net_device *end_dt_get_vrf_rcu(struct sk_buff *skb,
+ struct seg6_end_dt_info *info)
+{
+ int vrf_ifindex = info->vrf_ifindex;
+ struct net *net = info->net;
+
+ if (unlikely(vrf_ifindex < 0))
+ goto error;
+
+ if (unlikely(!net_eq(dev_net(skb->dev), net)))
+ goto error;
+
+ return dev_get_by_index_rcu(net, vrf_ifindex);
+
+error:
+ return NULL;
+}
+
+static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+ struct net_device *vrf;
+
+ vrf = end_dt_get_vrf_rcu(skb, info);
+ if (unlikely(!vrf))
+ goto drop;
+
+ skb->protocol = info->proto;
+
+ skb_dst_drop(skb);
+
+ skb_set_transport_header(skb, info->hdrlen);
+
+ return end_dt_vrf_rcv(skb, info->family, vrf);
+
+drop:
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+}
+
+static int input_action_end_dt4(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ struct iphdr *iph;
+ int err;
+
+ if (!decap_and_validate(skb, IPPROTO_IPIP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ skb = end_dt_vrf_core(skb, slwt);
+ if (!skb)
+ /* packet has been processed and consumed by the VRF */
+ return 0;
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ iph = ip_hdr(skb);
+
+ err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev);
+ if (unlikely(err))
+ goto drop;
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int seg6_end_dt4_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET, extack);
+}
+
+static enum
+seg6_end_dt_mode seg6_end_dt6_parse_mode(struct seg6_local_lwt *slwt)
+{
+ unsigned long parsed_optattrs = slwt->parsed_optattrs;
+ bool legacy, vrfmode;
+
+ legacy = !!(parsed_optattrs & (1 << SEG6_LOCAL_TABLE));
+ vrfmode = !!(parsed_optattrs & (1 << SEG6_LOCAL_VRFTABLE));
+
+ if (!(legacy ^ vrfmode))
+ /* both are absent or present: invalid DT6 mode */
+ return DT_INVALID_MODE;
+
+ return legacy ? DT_LEGACY_MODE : DT_VRF_MODE;
+}
+
+static enum seg6_end_dt_mode seg6_end_dt6_get_mode(struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+
+ return info->mode;
+}
+
+static int seg6_end_dt6_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ enum seg6_end_dt_mode mode = seg6_end_dt6_parse_mode(slwt);
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+
+ switch (mode) {
+ case DT_LEGACY_MODE:
+ info->mode = DT_LEGACY_MODE;
+ return 0;
+ case DT_VRF_MODE:
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET6, extack);
+ default:
+ NL_SET_ERR_MSG(extack, "table or vrftable must be specified");
+ return -EINVAL;
+ }
+}
+#endif
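Editor's note: a compact sketch of the DT6 mode selection rule implemented above: exactly one of the table (legacy) and vrftable (VRF mode) attributes must be present; both or neither is rejected. Names and values here are illustrative, not the kernel's.

#include <stdbool.h>

enum hyp_dt_mode { HYP_DT_INVALID = -1, HYP_DT_LEGACY = 0, HYP_DT_VRF = 1 };

/* exactly one of "table" (legacy) and "vrftable" (VRF mode) must be given */
static enum hyp_dt_mode hyp_dt6_mode(bool has_table, bool has_vrftable)
{
        if (has_table == has_vrftable)  /* both present or both absent */
                return HYP_DT_INVALID;

        return has_table ? HYP_DT_LEGACY : HYP_DT_VRF;
}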
+
static int input_action_end_dt6(struct sk_buff *skb,
struct seg6_local_lwt *slwt)
{
@@ -410,6 +705,28 @@ static int input_action_end_dt6(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ if (seg6_end_dt6_get_mode(slwt) == DT_LEGACY_MODE)
+ goto legacy_mode;
+
+ /* DT6_VRF_MODE */
+ skb = end_dt_vrf_core(skb, slwt);
+ if (!skb)
+ /* packet has been processed and consumed by the VRF */
+ return 0;
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* note: this time we do not need to specify the table because the VRF
+ * takes care of selecting the correct table.
+ */
+ seg6_lookup_any_nexthop(skb, NULL, 0, true);
+
+ return dst_input(skb);
+
+legacy_mode:
+#endif
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
seg6_lookup_any_nexthop(skb, NULL, slwt->table, true);
@@ -590,8 +907,27 @@ static struct seg6_action_desc seg6_action_table[] = {
.input = input_action_end_dx4,
},
{
+ .action = SEG6_LOCAL_ACTION_END_DT4,
+ .attrs = (1 << SEG6_LOCAL_VRFTABLE),
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .input = input_action_end_dt4,
+ .slwt_ops = {
+ .build_state = seg6_end_dt4_build,
+ },
+#endif
+ },
+ {
.action = SEG6_LOCAL_ACTION_END_DT6,
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .attrs = 0,
+ .optattrs = (1 << SEG6_LOCAL_TABLE) |
+ (1 << SEG6_LOCAL_VRFTABLE),
+ .slwt_ops = {
+ .build_state = seg6_end_dt6_build,
+ },
+#else
.attrs = (1 << SEG6_LOCAL_TABLE),
+#endif
.input = input_action_end_dt6,
},
{
@@ -649,6 +985,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_ACTION] = { .type = NLA_U32 },
[SEG6_LOCAL_SRH] = { .type = NLA_BINARY },
[SEG6_LOCAL_TABLE] = { .type = NLA_U32 },
+ [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 },
[SEG6_LOCAL_NH4] = { .type = NLA_BINARY,
.len = sizeof(struct in_addr) },
[SEG6_LOCAL_NH6] = { .type = NLA_BINARY,
@@ -710,6 +1047,11 @@ static int cmp_nla_srh(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return memcmp(a->srh, b->srh, len);
}
+static void destroy_attr_srh(struct seg6_local_lwt *slwt)
+{
+ kfree(slwt->srh);
+}
+
static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt)
{
slwt->table = nla_get_u32(attrs[SEG6_LOCAL_TABLE]);
@@ -733,6 +1075,53 @@ static int cmp_nla_table(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return 0;
}
+static struct
+seg6_end_dt_info *seg6_possible_end_dt_info(struct seg6_local_lwt *slwt)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ return &slwt->dt_info;
+#else
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+}
+
+static int parse_nla_vrftable(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
+
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
+ info->vrf_table = nla_get_u32(attrs[SEG6_LOCAL_VRFTABLE]);
+
+ return 0;
+}
+
+static int put_nla_vrftable(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
+
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
+ if (nla_put_u32(skb, SEG6_LOCAL_VRFTABLE, info->vrf_table))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int cmp_nla_vrftable(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ struct seg6_end_dt_info *info_a = seg6_possible_end_dt_info(a);
+ struct seg6_end_dt_info *info_b = seg6_possible_end_dt_info(b);
+
+ if (info_a->vrf_table != info_b->vrf_table)
+ return 1;
+
+ return 0;
+}
+
static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt)
{
memcpy(&slwt->nh4, nla_data(attrs[SEG6_LOCAL_NH4]),
@@ -901,16 +1290,30 @@ static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return strcmp(a->bpf.name, b->bpf.name);
}
+static void destroy_attr_bpf(struct seg6_local_lwt *slwt)
+{
+ kfree(slwt->bpf.name);
+ if (slwt->bpf.prog)
+ bpf_prog_put(slwt->bpf.prog);
+}
+
struct seg6_action_param {
int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b);
+
+ /* optional destroy() callback useful for releasing resources which
+ * have been previously acquired in the corresponding parse()
+ * function.
+ */
+ void (*destroy)(struct seg6_local_lwt *slwt);
};
static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_SRH] = { .parse = parse_nla_srh,
.put = put_nla_srh,
- .cmp = cmp_nla_srh },
+ .cmp = cmp_nla_srh,
+ .destroy = destroy_attr_srh },
[SEG6_LOCAL_TABLE] = { .parse = parse_nla_table,
.put = put_nla_table,
@@ -934,14 +1337,130 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf,
.put = put_nla_bpf,
- .cmp = cmp_nla_bpf },
+ .cmp = cmp_nla_bpf,
+ .destroy = destroy_attr_bpf },
+
+ [SEG6_LOCAL_VRFTABLE] = { .parse = parse_nla_vrftable,
+ .put = put_nla_vrftable,
+ .cmp = cmp_nla_vrftable },
};
+/* call the destroy() callback (if available) for each set attribute in
+ * @parsed_attrs, starting from the first attribute up to the @max_parsed
+ * (excluded) attribute.
+ */
+static void __destroy_attrs(unsigned long parsed_attrs, int max_parsed,
+ struct seg6_local_lwt *slwt)
+{
+ struct seg6_action_param *param;
+ int i;
+
+ /* Every required seg6local attribute is identified by an ID which is
+ * encoded as a flag (i.e., 1 << ID) in the 'attrs' bitmask;
+ *
+ * We scan the 'parsed_attrs' bitmask, starting from the first attribute
+ * up to the @max_parsed (excluded) attribute.
+ * For each set attribute, we retrieve the corresponding destroy()
+ * callback. If the callback is not available, then we skip to the next
+ * attribute; otherwise, we call the destroy() callback.
+ */
+ for (i = 0; i < max_parsed; ++i) {
+ if (!(parsed_attrs & (1 << i)))
+ continue;
+
+ param = &seg6_action_params[i];
+
+ if (param->destroy)
+ param->destroy(slwt);
+ }
+}
+
+/* release all the resources that may have been acquired during parsing
+ * operations.
+ */
+static void destroy_attrs(struct seg6_local_lwt *slwt)
+{
+ unsigned long attrs = slwt->desc->attrs | slwt->parsed_optattrs;
+
+ __destroy_attrs(attrs, SEG6_LOCAL_MAX + 1, slwt);
+}
+
+static int parse_nla_optional_attrs(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ unsigned long parsed_optattrs = 0;
+ struct seg6_action_param *param;
+ int err, i;
+
+ for (i = 0; i < SEG6_LOCAL_MAX + 1; ++i) {
+ if (!(desc->optattrs & (1 << i)) || !attrs[i])
+ continue;
+
+ /* once here, the i-th attribute has been provided by
+ * userspace AND is also marked as optional.
+ */
+ param = &seg6_action_params[i];
+
+ err = param->parse(attrs, slwt);
+ if (err < 0)
+ goto parse_optattrs_err;
+
+ /* current attribute has been correctly parsed */
+ parsed_optattrs |= (1 << i);
+ }
+
+ /* store in the tunnel state all the optional attributes that were
+ * successfully parsed.
+ */
+ slwt->parsed_optattrs = parsed_optattrs;
+
+ return 0;
+
+parse_optattrs_err:
+ __destroy_attrs(parsed_optattrs, i, slwt);
+
+ return err;
+}
+
+/* call the custom constructor of the behavior during its initialization phase,
+ * after all its attributes have been parsed successfully.
+ */
+static int
+seg6_local_lwtunnel_build_state(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ struct seg6_local_lwtunnel_ops *ops;
+
+ ops = &desc->slwt_ops;
+ if (!ops->build_state)
+ return 0;
+
+ return ops->build_state(slwt, cfg, extack);
+}
+
+/* call the custom destructor of the behavior, which is invoked just before
+ * the tunnel is destroyed.
+ */
+static void seg6_local_lwtunnel_destroy_state(struct seg6_local_lwt *slwt)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ struct seg6_local_lwtunnel_ops *ops;
+
+ ops = &desc->slwt_ops;
+ if (!ops->destroy_state)
+ return;
+
+ ops->destroy_state(slwt);
+}
+
static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
{
struct seg6_action_param *param;
struct seg6_action_desc *desc;
+ unsigned long invalid_attrs;
int i, err;
desc = __get_action_desc(slwt->action);
@@ -954,6 +1473,26 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
slwt->desc = desc;
slwt->headroom += desc->static_headroom;
+	/* Forcing the desc->optattrs *set* and the desc->attrs *set* to be
+	 * disjoint allows us to release the resources acquired by optional
+	 * attributes and by required attributes independently of each other,
+	 * without any interference.
+	 * In other words, we are sure that we never release any of the
+	 * acquired resources twice.
+	 *
+	 * Note that if an attribute is configured both as required and as
+	 * optional, it means that the user has messed something up in the
+	 * seg6_action_table. Therefore, this check is required for SRv6
+	 * behaviors to work properly.
+	 */
+ invalid_attrs = desc->attrs & desc->optattrs;
+ if (invalid_attrs) {
+ WARN_ONCE(1,
+ "An attribute cannot be both required AND optional");
+ return -EINVAL;
+ }
+
+ /* parse the required attributes */
for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
if (desc->attrs & (1 << i)) {
if (!attrs[i])
@@ -963,11 +1502,24 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
err = param->parse(attrs, slwt);
if (err < 0)
- return err;
+ goto parse_attrs_err;
}
}
+ /* parse the optional attributes, if any */
+ err = parse_nla_optional_attrs(attrs, slwt);
+ if (err < 0)
+ goto parse_attrs_err;
+
return 0;
+
+parse_attrs_err:
+ /* release any resource that may have been acquired during the first
+ * i - 1 parse() operations.
+ */
+ __destroy_attrs(desc->attrs, i, slwt);
+
+ return err;
}
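Editor's note: the error path above only unwinds the attributes parsed before the failure. A generic sketch of that unwind pattern, assuming an array of per-attribute release callbacks; all names below are hypothetical.

#include <stddef.h>

typedef void (*hyp_release_fn)(void *ctx);

/* release only the attributes 0..failed_idx-1 that are set in the mask */
static void hyp_unwind(unsigned long parsed_mask, int failed_idx,
                       const hyp_release_fn *release, void *ctx)
{
        int i;

        for (i = 0; i < failed_idx; i++) {
                if (!(parsed_mask & (1UL << i)))
                        continue;
                if (release[i])
                        release[i](ctx);
        }
}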
static int seg6_local_build_state(struct net *net, struct nlattr *nla,
@@ -1003,6 +1555,10 @@ static int seg6_local_build_state(struct net *net, struct nlattr *nla,
if (err < 0)
goto out_free;
+ err = seg6_local_lwtunnel_build_state(slwt, cfg, extack);
+ if (err < 0)
+ goto out_destroy_attrs;
+
newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL;
newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT;
newts->headroom = slwt->headroom;
@@ -1011,8 +1567,9 @@ static int seg6_local_build_state(struct net *net, struct nlattr *nla,
return 0;
+out_destroy_attrs:
+ destroy_attrs(slwt);
out_free:
- kfree(slwt->srh);
kfree(newts);
return err;
}
@@ -1021,12 +1578,9 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
{
struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
- kfree(slwt->srh);
+ seg6_local_lwtunnel_destroy_state(slwt);
- if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) {
- kfree(slwt->bpf.name);
- bpf_prog_put(slwt->bpf.prog);
- }
+ destroy_attrs(slwt);
return;
}
@@ -1036,13 +1590,16 @@ static int seg6_local_fill_encap(struct sk_buff *skb,
{
struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
struct seg6_action_param *param;
+ unsigned long attrs;
int i, err;
if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action))
return -EMSGSIZE;
+ attrs = slwt->desc->attrs | slwt->parsed_optattrs;
+
for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
- if (slwt->desc->attrs & (1 << i)) {
+ if (attrs & (1 << i)) {
param = &seg6_action_params[i];
err = param->put(skb, slwt);
if (err < 0)
@@ -1061,7 +1618,7 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
nlsize = nla_total_size(4); /* action */
- attrs = slwt->desc->attrs;
+ attrs = slwt->desc->attrs | slwt->parsed_optattrs;
if (attrs & (1 << SEG6_LOCAL_SRH))
nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3);
@@ -1086,6 +1643,9 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
nla_total_size(MAX_PROG_NAME) +
nla_total_size(4);
+ if (attrs & (1 << SEG6_LOCAL_VRFTABLE))
+ nlsize += nla_total_size(4);
+
return nlsize;
}
@@ -1094,6 +1654,7 @@ static int seg6_local_cmp_encap(struct lwtunnel_state *a,
{
struct seg6_local_lwt *slwt_a, *slwt_b;
struct seg6_action_param *param;
+ unsigned long attrs_a, attrs_b;
int i;
slwt_a = seg6_local_lwtunnel(a);
@@ -1102,11 +1663,14 @@ static int seg6_local_cmp_encap(struct lwtunnel_state *a,
if (slwt_a->action != slwt_b->action)
return 1;
- if (slwt_a->desc->attrs != slwt_b->desc->attrs)
+ attrs_a = slwt_a->desc->attrs | slwt_a->parsed_optattrs;
+ attrs_b = slwt_b->desc->attrs | slwt_b->parsed_optattrs;
+
+ if (attrs_a != attrs_b)
return 1;
for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
- if (slwt_a->desc->attrs & (1 << i)) {
+ if (attrs_a & (1 << i)) {
param = &seg6_action_params[i];
if (param->cmp(slwt_a, slwt_b))
return 1;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 5e2c34c0ac97..93636867aee2 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1128,7 +1128,6 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev)
if (tdev && !netif_is_l3_master(tdev)) {
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
- dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
dev->mtu = tdev->mtu - t_hlen;
if (dev->mtu < IPV6_MIN_MTU)
dev->mtu = IPV6_MIN_MTU;
@@ -1396,7 +1395,7 @@ static const struct net_device_ops ipip6_netdev_ops = {
.ndo_uninit = ipip6_tunnel_uninit,
.ndo_start_xmit = sit_tunnel_xmit,
.ndo_do_ioctl = ipip6_tunnel_ioctl,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
.ndo_tunnel_ctl = ipip6_tunnel_ctl,
};
@@ -1426,7 +1425,6 @@ static void ipip6_tunnel_setup(struct net_device *dev)
dev->priv_destructor = ipip6_dev_free;
dev->type = ARPHRD_SIT;
- dev->hard_header_len = LL_MAX_HEADER + t_hlen;
dev->mtu = ETH_DATA_LEN - t_hlen;
dev->min_mtu = IPV6_MIN_MTU;
dev->max_mtu = IP6_MAX_MTU - t_hlen;
@@ -1647,8 +1645,11 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev,
}
#ifdef CONFIG_IPV6_SIT_6RD
- if (ipip6_netlink_6rd_parms(data, &ip6rd))
+ if (ipip6_netlink_6rd_parms(data, &ip6rd)) {
err = ipip6_tunnel_update_6rd(nt, &ip6rd);
+ if (err < 0)
+ unregister_netdevice_queue(dev, NULL);
+ }
#endif
return err;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index e796a64be308..e8cfb9e997bf 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -136,7 +136,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
__u32 cookie = ntohl(th->ack_seq) - 1;
struct sock *ret = sk;
struct request_sock *req;
- int mss;
+ int full_space, mss;
struct dst_entry *dst;
__u8 rcv_wscale;
u32 tsoff = 0;
@@ -233,7 +233,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
fl6.flowi6_uid = sk->sk_uid;
- security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+ security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst))
@@ -241,7 +241,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
}
req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
- tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
+ /* limit the window selection if the user enforces a smaller rx buffer */
+ full_space = tcp_full_space(sk);
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
+
+ tcp_select_initial_window(sk, full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
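Editor's note: a small standalone sketch (with hypothetical numbers) of the clamping added above: when the receive buffer has been locked by the user, the initial window clamp is capped at the space the socket can actually buffer.

#include <stdio.h>

int main(void)
{
        unsigned int full_space = 28960;    /* assumed tcp_full_space() result */
        unsigned int window_clamp = 65535;  /* e.g. taken from the route metric */
        int rcvbuf_locked = 1;              /* user set SO_RCVBUF explicitly */

        if (rcvbuf_locked && (window_clamp > full_space || window_clamp == 0))
                window_clamp = full_space;

        printf("clamp=%u\n", window_clamp); /* prints clamp=28960 */
        return 0;
}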
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 305870a72352..0e1509b02cb3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -278,7 +278,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
@@ -458,7 +458,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
- * is already accepted it is treated as a connected one below.
+ * already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
@@ -501,7 +501,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
@@ -509,13 +510,14 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
+ u8 tclass;
/* First, grab a route. */
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
IPPROTO_TCP)) == NULL)
goto done;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
@@ -525,12 +527,21 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
if (np->repflow && ireq->pktopts)
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
+ tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (np->tclass & INET_ECN_MASK) :
+ np->tclass;
+
+ if (!INET_ECN_is_capable(tclass) &&
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
+ tclass |= INET_ECN_ECT_0;
+
rcu_read_lock();
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
- sk->sk_priority);
+ err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
+ tclass, sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -818,9 +829,15 @@ static void tcp_v6_init_req(struct request_sock *req,
}
static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
+ struct sk_buff *skb,
struct flowi *fl,
- const struct request_sock *req)
+ struct request_sock *req)
{
+ tcp_v6_init_req(req, sk, skb);
+
+ if (security_inet_conn_request(sk, skb, req))
+ return NULL;
+
return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP);
}
@@ -841,7 +858,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.req_md5_lookup = tcp_v6_md5_lookup,
.calc_md5_hash = tcp_v6_md5_hash_skb,
#endif
- .init_req = tcp_v6_init_req,
#ifdef CONFIG_SYN_COOKIES
.cookie_init_seq = cookie_v6_init_sequence,
#endif
@@ -949,7 +965,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
* Underlying function will use this to retrieve the network
@@ -958,8 +974,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
- ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass,
- priority);
+ ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL,
+ tclass & ~INET_ECN_MASK, priority);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -1067,8 +1083,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
label = ip6_flowlabel(ipv6h);
}
- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
- label, priority);
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1,
+ ipv6_get_dsfield(ipv6h), label, priority);
#ifdef CONFIG_TCP_MD5SIG
out:
@@ -1121,7 +1137,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index),
- 0, 0, sk->sk_priority);
+ ipv6_get_dsfield(ipv6_hdr(skb)), 0, sk->sk_priority);
}
@@ -1188,6 +1204,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct ipv6_txoptions *opt;
struct inet_sock *newinet;
+ bool found_dup_sk = false;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
@@ -1309,6 +1326,12 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (np->repflow)
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+ /* Set ToS of the new socket based upon the value of incoming SYN.
+ * ECT bits are set later in tcp_init_transfer().
+ */
+ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
/* Clone native IPv6 options from listening socket (if any)
Yes, keeping reference count would be much more clever,
@@ -1359,7 +1382,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
tcp_done(newsk);
goto out;
}
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
+ &found_dup_sk);
if (*own_req) {
tcp_move_syn(newtp, req);
@@ -1374,6 +1398,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
skb_set_owner_r(newnp->pktoptions, newsk);
}
}
+ } else {
+ if (!req_unhash && found_dup_sk) {
+ /* This code path should be executed only in the
+ * syncookie case
+ */
+ bh_unlock_sock(newsk);
+ sock_put(newsk);
+ newsk = NULL;
+ }
}
return newsk;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 29d9691359b9..b9f3dfdd2383 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -276,7 +276,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
inet6_sdif(skb), udptable, skb);
}
-struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
+struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -285,7 +285,6 @@ struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
&iph->daddr, dport, inet6_iif(skb),
inet6_sdif(skb), &udp_table, NULL);
}
-EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
/* Must be called under rcu_read_lock().
* Does increment socket refcount.
@@ -560,7 +559,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
inet6_iif(skb), inet6_sdif(skb), udptable, NULL);
- if (!sk) {
+ if (!sk || udp_sk(sk)->encap_type) {
/* No socket for error: try tunnels before discarding */
sk = ERR_PTR(-ENOENT);
if (static_branch_unlikely(&udpv6_encap_needed_key)) {
@@ -637,6 +636,9 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (rc == -ENOMEM)
UDP6_INC_STATS(sock_net(sk),
UDP_MIB_RCVBUFERRORS, is_udplite);
+ else
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_MEMERRORS, is_udplite);
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
return -1;
@@ -1496,7 +1498,7 @@ do_udp_sendmsg:
} else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
if (ipc6.tclass < 0)
ipc6.tclass = np->tclass;
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 584157a07759..faa823c24292 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -28,10 +28,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
int tnl_hlen;
int err;
- mss = skb_shinfo(skb)->gso_size;
- if (unlikely(skb->len <= mss))
- goto out;
-
if (skb->encapsulation && skb_shinfo(skb)->gso_type &
(SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
segs = skb_udp_tunnel_segment(skb, features, true);
@@ -46,7 +42,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
goto out;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
- return __udp_gso_segment(skb, features);
+ return __udp_gso_segment(skb, features, true);
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
* do checksum of UDP packets sent as multiple IP fragments.
@@ -111,12 +111,22 @@ out:
return segs;
}
+static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
+ __be16 dport)
+{
+ const struct ipv6hdr *iph = skb_gro_network_header(skb);
+
+ return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
+ &iph->daddr, dport, inet6_iif(skb),
+ inet6_sdif(skb), &udp_table, NULL);
+}
+
INDIRECT_CALLABLE_SCOPE
struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sock *sk = NULL;
struct sk_buff *pp;
- struct sock *sk;
if (unlikely(!uh))
goto flush;
@@ -135,7 +145,10 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 1;
rcu_read_lock();
- sk = static_branch_unlikely(&udpv6_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
+
+ if (static_branch_unlikely(&udpv6_encap_needed_key))
+ sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest);
+
pp = udp_gro_receive(head, skb, uh, sk);
rcu_read_unlock();
return pp;
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 25b7ebda2fab..f696d46e6910 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -303,13 +303,13 @@ static const struct xfrm_type xfrm6_tunnel_type = {
static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = {
.handler = xfrm6_tunnel_rcv,
.err_handler = xfrm6_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = {
.handler = xfrm6_tunnel_rcv,
.err_handler = xfrm6_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
static int __net_init xfrm6_tunnel_net_init(struct net *net)
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index a95af62acb52..882f028992c3 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -587,12 +587,12 @@ static void __iucv_auto_name(struct iucv_sock *iucv)
static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr,
int addr_len)
{
- struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+ DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr);
+ char uid[sizeof(sa->siucv_user_id)];
struct sock *sk = sock->sk;
struct iucv_sock *iucv;
int err = 0;
struct net_device *dev;
- char uid[9];
/* Verify the input sockaddr */
if (addr_len < sizeof(struct sockaddr_iucv) ||
@@ -691,7 +691,7 @@ static int iucv_sock_autobind(struct sock *sk)
static int afiucv_path_connect(struct socket *sock, struct sockaddr *addr)
{
- struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+ DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr);
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
unsigned char user_data[16];
@@ -738,7 +738,7 @@ done:
static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags)
{
- struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+ DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr);
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
int err;
@@ -874,7 +874,7 @@ done:
static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr,
int peer)
{
- struct sockaddr_iucv *siucv = (struct sockaddr_iucv *) addr;
+ DECLARE_SOCKADDR(struct sockaddr_iucv *, siucv, addr);
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
@@ -1434,7 +1434,8 @@ static int iucv_sock_shutdown(struct socket *sock, int how)
break;
}
- if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) {
+ if ((how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) &&
+ sk->sk_state == IUCV_CONNECTED) {
if (iucv->transport == AF_IUCV_TRANS_IUCV) {
txmsg.class = 0;
txmsg.tag = 0;
@@ -1644,7 +1645,7 @@ static int iucv_callback_connreq(struct iucv_path *path,
}
/* Create the new socket */
- nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
+ nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0);
if (!nsk) {
err = pr_iucv->path_sever(path, user_data);
iucv_path_free(path);
@@ -1850,7 +1851,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
goto out;
}
- nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
+ nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0);
bh_lock_sock(sk);
if ((sk->sk_state != IUCV_LISTEN) ||
sk_acceptq_is_full(sk) ||
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index cd2e468852e7..349c6ac3313f 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -1116,10 +1116,9 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
if (msg->flags & IUCV_IPRMDATA)
return iucv_message_receive_iprmdata(path, msg, flags,
buffer, size, residual);
- if (cpumask_empty(&iucv_buffer_cpumask)) {
- rc = -EIO;
- goto out;
- }
+ if (cpumask_empty(&iucv_buffer_cpumask))
+ return -EIO;
+
parm = iucv_param[smp_processor_id()];
memset(parm, 0, sizeof(union iucv_param));
parm->db.ipbfadr1 = (u32)(addr_t) buffer;
@@ -1135,7 +1134,6 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
if (residual)
*residual = parm->db.ipbfln1f;
}
-out:
return rc;
}
EXPORT_SYMBOL(__iucv_message_receive);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index c12dbc51ef5f..ef9b4ac03e7b 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2902,7 +2902,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t)
break;
if (!aalg->pfkey_supported)
continue;
- if (aalg_tmpl_set(t, aalg) && aalg->available)
+ if (aalg_tmpl_set(t, aalg))
sz += sizeof(struct sadb_comb);
}
return sz + sizeof(struct sadb_prop);
@@ -2920,7 +2920,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t)
if (!ealg->pfkey_supported)
continue;
- if (!(ealg_tmpl_set(t, ealg) && ealg->available))
+ if (!(ealg_tmpl_set(t, ealg)))
continue;
for (k = 1; ; k++) {
@@ -2931,7 +2931,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t)
if (!aalg->pfkey_supported)
continue;
- if (aalg_tmpl_set(t, aalg) && aalg->available)
+ if (aalg_tmpl_set(t, aalg))
sz += sizeof(struct sadb_comb);
}
}
diff --git a/net/l2tp/Makefile b/net/l2tp/Makefile
index 399a7e5db2f4..cf8f27071d3f 100644
--- a/net/l2tp/Makefile
+++ b/net/l2tp/Makefile
@@ -5,6 +5,8 @@
obj-$(CONFIG_L2TP) += l2tp_core.o
+CFLAGS_l2tp_core.o += -I$(src)
+
# Build l2tp as modules if L2TP is M
obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_PPPOL2TP)) += l2tp_ppp.o
obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_IP)) += l2tp_ip.o
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 701fc72ad9f4..7be5103ff2a8 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -61,6 +61,10 @@
#include <linux/atomic.h>
#include "l2tp_core.h"
+#include "trace.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
#define L2TP_DRV_VERSION "V2.0"
@@ -116,11 +120,6 @@ static bool l2tp_sk_is_v6(struct sock *sk)
}
#endif
-static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk)
-{
- return sk->sk_user_data;
-}
-
static inline struct l2tp_net *l2tp_pernet(const struct net *net)
{
return net_generic(net, l2tp_net_id);
@@ -151,23 +150,30 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id)
static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
{
+ trace_free_tunnel(tunnel);
sock_put(tunnel->sock);
/* the tunnel is freed in the socket destructor */
}
static void l2tp_session_free(struct l2tp_session *session)
{
- struct l2tp_tunnel *tunnel = session->tunnel;
+ trace_free_session(session);
+ if (session->tunnel)
+ l2tp_tunnel_dec_refcount(session->tunnel);
+ kfree(session);
+}
- if (tunnel) {
+struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk)
+{
+ struct l2tp_tunnel *tunnel = sk->sk_user_data;
+
+ if (tunnel)
if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC))
- goto out;
- l2tp_tunnel_dec_refcount(tunnel);
- }
+ return NULL;
-out:
- kfree(session);
+ return tunnel;
}
+EXPORT_SYMBOL_GPL(l2tp_sk_to_tunnel);
void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel)
{
@@ -381,6 +387,8 @@ int l2tp_session_register(struct l2tp_session *session,
hlist_add_head(&session->hlist, head);
write_unlock_bh(&tunnel->hlist_lock);
+ trace_register_session(session);
+
return 0;
err_tlock_pnlock:
@@ -409,10 +417,6 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk
skb_queue_walk_safe(&session->reorder_q, skbp, tmp) {
if (L2TP_SKB_CB(skbp)->ns > ns) {
__skb_queue_before(&session->reorder_q, skbp, skb);
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: pkt %hu, inserted before %hu, reorder_q len=%d\n",
- session->name, ns, L2TP_SKB_CB(skbp)->ns,
- skb_queue_len(&session->reorder_q));
atomic_long_inc(&session->stats.rx_oos_packets);
goto out;
}
@@ -445,9 +449,7 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff *
/* Bump our Nr */
session->nr++;
session->nr &= session->nr_max;
-
- l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated nr to %hu\n",
- session->name, session->nr);
+ trace_session_seqnum_update(session);
}
/* call private receive handler */
@@ -472,37 +474,27 @@ static void l2tp_recv_dequeue(struct l2tp_session *session)
start:
spin_lock_bh(&session->reorder_q.lock);
skb_queue_walk_safe(&session->reorder_q, skb, tmp) {
- if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) {
+ struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb);
+
+ /* If the packet has been pending on the queue for too long, discard it */
+ if (time_after(jiffies, cb->expires)) {
atomic_long_inc(&session->stats.rx_seq_discards);
atomic_long_inc(&session->stats.rx_errors);
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: oos pkt %u len %d discarded (too old), waiting for %u, reorder_q_len=%d\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr,
- skb_queue_len(&session->reorder_q));
+ trace_session_pkt_expired(session, cb->ns);
session->reorder_skip = 1;
__skb_unlink(skb, &session->reorder_q);
kfree_skb(skb);
continue;
}
- if (L2TP_SKB_CB(skb)->has_seq) {
+ if (cb->has_seq) {
if (session->reorder_skip) {
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: advancing nr to next pkt: %u -> %u",
- session->name, session->nr,
- L2TP_SKB_CB(skb)->ns);
session->reorder_skip = 0;
- session->nr = L2TP_SKB_CB(skb)->ns;
+ session->nr = cb->ns;
+ trace_session_seqnum_reset(session);
}
- if (L2TP_SKB_CB(skb)->ns != session->nr) {
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: holding oos pkt %u len %d, waiting for %u, reorder_q_len=%d\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr,
- skb_queue_len(&session->reorder_q));
+ if (cb->ns != session->nr)
goto out;
- }
}
__skb_unlink(skb, &session->reorder_q);
@@ -535,14 +527,13 @@ static int l2tp_seq_check_rx_window(struct l2tp_session *session, u32 nr)
*/
static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
{
- if (!l2tp_seq_check_rx_window(session, L2TP_SKB_CB(skb)->ns)) {
+ struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb);
+
+ if (!l2tp_seq_check_rx_window(session, cb->ns)) {
/* Packet sequence number is outside allowed window.
* Discard it.
*/
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: pkt %u len %d discarded, outside window, nr=%u\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr);
+ trace_session_pkt_outside_rx_window(session, cb->ns);
goto discard;
}
@@ -559,10 +550,10 @@ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
* is seen. After nr_oos_count_max in-sequence packets, reset the
* sequence number to re-enable packet reception.
*/
- if (L2TP_SKB_CB(skb)->ns == session->nr) {
+ if (cb->ns == session->nr) {
skb_queue_tail(&session->reorder_q, skb);
} else {
- u32 nr_oos = L2TP_SKB_CB(skb)->ns;
+ u32 nr_oos = cb->ns;
u32 nr_next = (session->nr_oos + 1) & session->nr_max;
if (nr_oos == nr_next)
@@ -573,17 +564,10 @@ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
session->nr_oos = nr_oos;
if (session->nr_oos_count > session->nr_oos_count_max) {
session->reorder_skip = 1;
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: %d oos packets received. Resetting sequence numbers\n",
- session->name, session->nr_oos_count);
}
if (!session->reorder_skip) {
atomic_long_inc(&session->stats.rx_seq_discards);
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: oos pkt %u len %d discarded, waiting for %u, reorder_q_len=%d\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr,
- skb_queue_len(&session->reorder_q));
+ trace_session_pkt_oos(session, cb->ns);
goto discard;
}
skb_queue_tail(&session->reorder_q, skb);
@@ -660,16 +644,14 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
int length)
{
struct l2tp_tunnel *tunnel = session->tunnel;
- u32 ns = 0, nr = 0;
int offset;
/* Parse and check optional cookie */
if (session->peer_cookie_len > 0) {
if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: cookie mismatch (%u/%u). Discarding.\n",
- tunnel->name, tunnel->tunnel_id,
- session->session_id);
+ pr_warn_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n",
+ tunnel->name, tunnel->tunnel_id,
+ session->session_id);
atomic_long_inc(&session->stats.rx_cookie_discards);
goto discard;
}
@@ -686,32 +668,21 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
L2TP_SKB_CB(skb)->has_seq = 0;
if (tunnel->version == L2TP_HDR_VER_2) {
if (hdrflags & L2TP_HDRFLAG_S) {
- ns = ntohs(*(__be16 *)ptr);
- ptr += 2;
- nr = ntohs(*(__be16 *)ptr);
- ptr += 2;
-
/* Store L2TP info in the skb */
- L2TP_SKB_CB(skb)->ns = ns;
+ L2TP_SKB_CB(skb)->ns = ntohs(*(__be16 *)ptr);
L2TP_SKB_CB(skb)->has_seq = 1;
+ ptr += 2;
+ /* Skip past nr in the header */
+ ptr += 2;
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: recv data ns=%u, nr=%u, session nr=%u\n",
- session->name, ns, nr, session->nr);
}
} else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) {
u32 l2h = ntohl(*(__be32 *)ptr);
if (l2h & 0x40000000) {
- ns = l2h & 0x00ffffff;
-
/* Store L2TP info in the skb */
- L2TP_SKB_CB(skb)->ns = ns;
+ L2TP_SKB_CB(skb)->ns = l2h & 0x00ffffff;
L2TP_SKB_CB(skb)->has_seq = 1;
-
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: recv data ns=%u, session nr=%u\n",
- session->name, ns, session->nr);
}
ptr += 4;
}
@@ -722,9 +693,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
* configure it so.
*/
if (!session->lns_mode && !session->send_seq) {
- l2tp_info(session, L2TP_MSG_SEQ,
- "%s: requested to enable seq numbers by LNS\n",
- session->name);
+ trace_session_seqnum_lns_enable(session);
session->send_seq = 1;
l2tp_session_set_header_len(session, tunnel->version);
}
@@ -733,9 +702,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
* If user has configured mandatory sequence numbers, discard.
*/
if (session->recv_seq) {
- l2tp_warn(session, L2TP_MSG_SEQ,
- "%s: recv data has no seq numbers when required. Discarding.\n",
- session->name);
+ pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
+ session->name);
atomic_long_inc(&session->stats.rx_seq_discards);
goto discard;
}
@@ -746,15 +714,12 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
* LAC is broken. Discard the frame.
*/
if (!session->lns_mode && session->send_seq) {
- l2tp_info(session, L2TP_MSG_SEQ,
- "%s: requested to disable seq numbers by LNS\n",
- session->name);
+ trace_session_seqnum_lns_disable(session);
session->send_seq = 0;
l2tp_session_set_header_len(session, tunnel->version);
} else if (session->send_seq) {
- l2tp_warn(session, L2TP_MSG_SEQ,
- "%s: recv data has no seq numbers when required. Discarding.\n",
- session->name);
+ pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
+ session->name);
atomic_long_inc(&session->stats.rx_seq_discards);
goto discard;
}
@@ -816,9 +781,6 @@ static void l2tp_session_queue_purge(struct l2tp_session *session)
{
struct sk_buff *skb = NULL;
- if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
- return;
-
while ((skb = skb_dequeue(&session->reorder_q))) {
atomic_long_inc(&session->stats.rx_errors);
kfree_skb(skb);
@@ -847,22 +809,11 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
/* Short packet? */
if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) {
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: recv short packet (len=%d)\n",
- tunnel->name, skb->len);
+ pr_warn_ratelimited("%s: recv short packet (len=%d)\n",
+ tunnel->name, skb->len);
goto error;
}
- /* Trace packet contents, if enabled */
- if (tunnel->debug & L2TP_MSG_DATA) {
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto error;
-
- pr_debug("%s: recv\n", tunnel->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length);
- }
-
/* Point to L2TP header */
optr = skb->data;
ptr = skb->data;
@@ -873,9 +824,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
/* Check protocol version */
version = hdrflags & L2TP_HDR_VER_MASK;
if (version != tunnel->version) {
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: recv protocol version mismatch: got %d expected %d\n",
- tunnel->name, version, tunnel->version);
+ pr_warn_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n",
+ tunnel->name, version, tunnel->version);
goto error;
}
@@ -883,12 +833,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
length = skb->len;
/* If type is control packet, it is handled by userspace. */
- if (hdrflags & L2TP_HDRFLAG_T) {
- l2tp_dbg(tunnel, L2TP_MSG_DATA,
- "%s: recv control packet, len=%d\n",
- tunnel->name, length);
+ if (hdrflags & L2TP_HDRFLAG_T)
goto error;
- }
/* Skip flags */
ptr += 2;
@@ -917,9 +863,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
l2tp_session_dec_refcount(session);
/* Not found? Pass to userspace to deal with */
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: no session found (%u/%u). Passing up.\n",
- tunnel->name, tunnel_id, session_id);
+ pr_warn_ratelimited("%s: no session found (%u/%u). Passing up.\n",
+ tunnel->name, tunnel_id, session_id);
goto error;
}
@@ -949,12 +894,17 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
struct l2tp_tunnel *tunnel;
+ /* Note that this is called from the encap_rcv hook inside an
+ * RCU-protected region, but without the socket being locked.
+ * Hence we use rcu_dereference_sk_user_data to access the
+ * tunnel data structure rather than the usual l2tp_sk_to_tunnel
+ * accessor function.
+ */
tunnel = rcu_dereference_sk_user_data(sk);
if (!tunnel)
goto pass_up;
-
- l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n",
- tunnel->name, skb->len);
+ if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC))
+ goto pass_up;
if (l2tp_udp_recv_core(tunnel, skb))
goto pass_up;
@@ -993,8 +943,7 @@ static int l2tp_build_l2tpv2_header(struct l2tp_session *session, void *buf)
*bufp++ = 0;
session->ns++;
session->ns &= 0xffff;
- l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated ns to %u\n",
- session->name, session->ns);
+ trace_session_seqnum_update(session);
}
return bufp - optr;
@@ -1030,9 +979,7 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
l2h = 0x40000000 | session->ns;
session->ns++;
session->ns &= 0xffffff;
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: updated ns to %u\n",
- session->name, session->ns);
+ trace_session_seqnum_update(session);
}
*((__be32 *)bufp) = htonl(l2h);
@@ -1042,74 +989,39 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
return bufp - optr;
}
-static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
- struct flowi *fl, size_t data_len)
+/* Queue the packet to IP for output: tunnel socket lock must be held */
+static int l2tp_xmit_queue(struct l2tp_tunnel *tunnel, struct sk_buff *skb, struct flowi *fl)
{
- struct l2tp_tunnel *tunnel = session->tunnel;
- unsigned int len = skb->len;
- int error;
-
- /* Debug */
- if (session->send_seq)
- l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n",
- session->name, data_len, session->ns - 1);
- else
- l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n",
- session->name, data_len);
-
- if (session->debug & L2TP_MSG_DATA) {
- int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
- unsigned char *datap = skb->data + uhlen;
-
- pr_debug("%s: xmit\n", session->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET,
- datap, min_t(size_t, 32, len - uhlen));
- }
+ int err;
- /* Queue the packet to IP for output */
skb->ignore_df = 1;
skb_dst_drop(skb);
#if IS_ENABLED(CONFIG_IPV6)
if (l2tp_sk_is_v6(tunnel->sock))
- error = inet6_csk_xmit(tunnel->sock, skb, NULL);
+ err = inet6_csk_xmit(tunnel->sock, skb, NULL);
else
#endif
- error = ip_queue_xmit(tunnel->sock, skb, fl);
+ err = ip_queue_xmit(tunnel->sock, skb, fl);
- /* Update stats */
- if (error >= 0) {
- atomic_long_inc(&tunnel->stats.tx_packets);
- atomic_long_add(len, &tunnel->stats.tx_bytes);
- atomic_long_inc(&session->stats.tx_packets);
- atomic_long_add(len, &session->stats.tx_bytes);
- } else {
- atomic_long_inc(&tunnel->stats.tx_errors);
- atomic_long_inc(&session->stats.tx_errors);
- }
+ return err >= 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
-/* If caller requires the skb to have a ppp header, the header must be
- * inserted in the skb data before calling this function.
- */
-int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len)
+static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, unsigned int *len)
{
- int data_len = skb->len;
struct l2tp_tunnel *tunnel = session->tunnel;
+ unsigned int data_len = skb->len;
struct sock *sk = tunnel->sock;
- struct flowi *fl;
- struct udphdr *uh;
- struct inet_sock *inet;
- int headroom;
- int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
- int udp_len;
+ int headroom, uhlen, udp_len;
int ret = NET_XMIT_SUCCESS;
+ struct inet_sock *inet;
+ struct udphdr *uh;
/* Check that there's enough headroom in the skb to insert IP,
* UDP and L2TP headers. If not enough, expand it to
* make room. Adjust truesize.
*/
- headroom = NET_SKB_PAD + sizeof(struct iphdr) +
- uhlen + hdr_len;
+ uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(*uh) : 0;
+ headroom = NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len;
if (skb_cow_head(skb, headroom)) {
kfree_skb(skb);
return NET_XMIT_DROP;
@@ -1117,14 +1029,13 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
/* Setup L2TP header */
if (tunnel->version == L2TP_HDR_VER_2)
- l2tp_build_l2tpv2_header(session, __skb_push(skb, hdr_len));
+ l2tp_build_l2tpv2_header(session, __skb_push(skb, session->hdr_len));
else
- l2tp_build_l2tpv3_header(session, __skb_push(skb, hdr_len));
+ l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len));
/* Reset skb netfilter state */
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
+ IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED);
nf_reset_ct(skb);
bh_lock_sock(sk);
@@ -1143,8 +1054,12 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
goto out_unlock;
}
+ /* Report transmitted length before we add encap header, which keeps
+ * statistics consistent for both UDP and IP encap tx/rx paths.
+ */
+ *len = skb->len;
+
inet = inet_sk(sk);
- fl = &inet->cork.fl;
switch (tunnel->encap) {
case L2TP_ENCAPTYPE_UDP:
/* Setup UDP header */
@@ -1153,7 +1068,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
uh = udp_hdr(skb);
uh->source = inet->inet_sport;
uh->dest = inet->inet_dport;
- udp_len = uhlen + hdr_len + data_len;
+ udp_len = uhlen + session->hdr_len + data_len;
uh->len = htons(udp_len);
/* Calculate UDP checksum if configured to do so */
@@ -1172,12 +1087,34 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
break;
}
- l2tp_xmit_core(session, skb, fl, data_len);
+ ret = l2tp_xmit_queue(tunnel, skb, &inet->cork.fl);
+
out_unlock:
bh_unlock_sock(sk);
return ret;
}
+
+/* If caller requires the skb to have a ppp header, the header must be
+ * inserted in the skb data before calling this function.
+ */
+int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb)
+{
+ unsigned int len = 0;
+ int ret;
+
+ ret = l2tp_xmit_core(session, skb, &len);
+ if (ret == NET_XMIT_SUCCESS) {
+ atomic_long_inc(&session->tunnel->stats.tx_packets);
+ atomic_long_add(len, &session->tunnel->stats.tx_bytes);
+ atomic_long_inc(&session->stats.tx_packets);
+ atomic_long_add(len, &session->stats.tx_bytes);
+ } else {
+ atomic_long_inc(&session->tunnel->stats.tx_errors);
+ atomic_long_inc(&session->stats.tx_errors);
+ }
+ return ret;
+}
EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
/*****************************************************************************
@@ -1190,13 +1127,11 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
*/
static void l2tp_tunnel_destruct(struct sock *sk)
{
- struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
+ struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk);
if (!tunnel)
goto end;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name);
-
/* Disable udp encapsulation */
switch (tunnel->encap) {
case L2TP_ENCAPTYPE_UDP:
@@ -1255,34 +1190,16 @@ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
struct hlist_node *tmp;
struct l2tp_session *session;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing all sessions...\n",
- tunnel->name);
-
write_lock_bh(&tunnel->hlist_lock);
tunnel->acpt_newsess = false;
for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
again:
hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) {
session = hlist_entry(walk, struct l2tp_session, hlist);
-
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: closing session\n", session->name);
-
hlist_del_init(&session->hlist);
- if (test_and_set_bit(0, &session->dead))
- goto again;
-
write_unlock_bh(&tunnel->hlist_lock);
-
- l2tp_session_unhash(session);
- l2tp_session_queue_purge(session);
-
- if (session->session_close)
- (*session->session_close)(session);
-
- l2tp_session_dec_refcount(session);
-
+ l2tp_session_delete(session);
write_lock_bh(&tunnel->hlist_lock);
/* Now restart from the beginning of this hash
@@ -1299,7 +1216,7 @@ again:
/* Tunnel socket destroy hook for UDP encapsulation */
static void l2tp_udp_encap_destroy(struct sock *sk)
{
- struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
+ struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk);
if (tunnel)
l2tp_tunnel_delete(tunnel);
@@ -1464,7 +1381,7 @@ out:
static struct lock_class_key l2tp_socket_class;
-int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id,
+int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id,
struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp)
{
struct l2tp_tunnel *tunnel = NULL;
@@ -1483,16 +1400,12 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
tunnel->version = version;
tunnel->tunnel_id = tunnel_id;
tunnel->peer_tunnel_id = peer_tunnel_id;
- tunnel->debug = L2TP_DEFAULT_DEBUG_FLAGS;
tunnel->magic = L2TP_TUNNEL_MAGIC;
sprintf(&tunnel->name[0], "tunl %u", tunnel_id);
rwlock_init(&tunnel->hlist_lock);
tunnel->acpt_newsess = true;
- if (cfg)
- tunnel->debug = cfg->debug;
-
tunnel->encap = encap;
refcount_set(&tunnel->ref_count, 1);
@@ -1597,6 +1510,8 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
"l2tp_sock");
sk->sk_allocation = GFP_ATOMIC;
+ trace_register_tunnel(tunnel);
+
if (tunnel->fd >= 0)
sockfd_put(sock);
@@ -1617,6 +1532,7 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_register);
void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
{
if (!test_and_set_bit(0, &tunnel->dead)) {
+ trace_delete_tunnel(tunnel);
l2tp_tunnel_inc_refcount(tunnel);
queue_work(l2tp_wq, &tunnel->del_work);
}
@@ -1628,6 +1544,7 @@ void l2tp_session_delete(struct l2tp_session *session)
if (test_and_set_bit(0, &session->dead))
return;
+ trace_delete_session(session);
l2tp_session_unhash(session);
l2tp_session_queue_purge(session);
if (session->session_close)
@@ -1686,12 +1603,8 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
INIT_HLIST_NODE(&session->hlist);
INIT_HLIST_NODE(&session->global_hlist);
- /* Inherit debug options from tunnel */
- session->debug = tunnel->debug;
-
if (cfg) {
session->pwtype = cfg->pw_type;
- session->debug = cfg->debug;
session->send_seq = cfg->send_seq;
session->recv_seq = cfg->recv_seq;
session->lns_mode = cfg->lns_mode;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 3468d6b177a0..cb21d906343e 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -51,7 +51,6 @@ struct l2tp_session_cfg {
unsigned int lns_mode:1; /* behave as LNS?
* LAC enables sequence numbers under LNS control.
*/
- int debug; /* bitmask of debug message categories */
u16 l2specific_type; /* Layer 2 specific type */
u8 cookie[8]; /* optional cookie */
int cookie_len; /* 0, 4 or 8 bytes */
@@ -66,6 +65,7 @@ struct l2tp_session_cfg {
* Is linked into a per-tunnel session hashlist; and in the case of an L2TPv3 session into
* an additional per-net ("global") hashlist.
*/
+#define L2TP_SESSION_NAME_MAX 32
struct l2tp_session {
int magic; /* should be L2TP_SESSION_MAGIC */
long dead;
@@ -90,14 +90,13 @@ struct l2tp_session {
struct hlist_node hlist; /* hash list node */
refcount_t ref_count;
- char name[32]; /* for logging */
+ char name[L2TP_SESSION_NAME_MAX]; /* for logging */
char ifname[IFNAMSIZ];
unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */
unsigned int send_seq:1; /* send packets with sequence numbers? */
unsigned int lns_mode:1; /* behave as LNS?
* LAC enables sequence numbers under LNS control.
*/
- int debug; /* bitmask of debug message categories */
int reorder_timeout; /* configured reorder timeout (in jiffies) */
int reorder_skip; /* set if skip to next nr */
enum l2tp_pwtype pwtype;
@@ -131,7 +130,6 @@ struct l2tp_session {
/* L2TP tunnel configuration */
struct l2tp_tunnel_cfg {
- int debug; /* bitmask of debug message categories */
enum l2tp_encap_type encap;
/* Used only for kernel-created sockets */
@@ -154,6 +152,7 @@ struct l2tp_tunnel_cfg {
* Maintains a hashlist of sessions belonging to the tunnel instance.
* Is linked into a per-net list of tunnels.
*/
+#define L2TP_TUNNEL_NAME_MAX 20
struct l2tp_tunnel {
int magic; /* Should be L2TP_TUNNEL_MAGIC */
@@ -170,8 +169,7 @@ struct l2tp_tunnel {
u32 peer_tunnel_id;
int version; /* 2=>L2TPv2, 3=>L2TPv3 */
- char name[20]; /* for logging */
- int debug; /* bitmask of debug message categories */
+ char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */
enum l2tp_encap_type encap;
struct l2tp_stats stats;
@@ -237,7 +235,7 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
* Creation of a new instance is a two-step process: create, then register.
* Destruction is triggered using the *_delete functions, and completes asynchronously.
*/
-int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
+int l2tp_tunnel_create(int fd, int version, u32 tunnel_id,
u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
struct l2tp_tunnel **tunnelp);
int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
@@ -263,8 +261,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
/* Transmit path helpers for sending packets over the tunnel socket. */
void l2tp_session_set_header_len(struct l2tp_session *session, int version);
-int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb,
- int hdr_len);
+int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb);
/* Pseudowire management.
* Pseudowires should register with l2tp core on module init, and unregister
@@ -276,6 +273,11 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
/* IOCTL helper for IP encap modules. */
int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+/* Extract the tunnel structure from a socket's sk_user_data pointer,
+ * validating the tunnel magic feather.
+ */
+struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk);
+
static inline int l2tp_get_l2specific_len(struct l2tp_session *session)
{
switch (session->l2specific_type) {
@@ -337,19 +339,6 @@ static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, str
return 0;
}
-#define l2tp_printk(ptr, type, func, fmt, ...) \
-do { \
- if (((ptr)->debug) & (type)) \
- func(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define l2tp_warn(ptr, type, fmt, ...) \
- l2tp_printk(ptr, type, pr_warn, fmt, ##__VA_ARGS__)
-#define l2tp_info(ptr, type, fmt, ...) \
- l2tp_printk(ptr, type, pr_info, fmt, ##__VA_ARGS__)
-#define l2tp_dbg(ptr, type, fmt, ...) \
- l2tp_printk(ptr, type, pr_debug, fmt, ##__VA_ARGS__)
-
#define MODULE_ALIAS_L2TP_PWTYPE(type) \
MODULE_ALIAS("net-l2tp-type-" __stringify(type))
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 96cb9601c21b..bca75bef8282 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -167,7 +167,7 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
tunnel->sock ? refcount_read(&tunnel->sock->sk_refcnt) : 0,
refcount_read(&tunnel->ref_count));
seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n",
- tunnel->debug,
+ 0,
atomic_long_read(&tunnel->stats.tx_packets),
atomic_long_read(&tunnel->stats.tx_bytes),
atomic_long_read(&tunnel->stats.tx_errors),
@@ -192,7 +192,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
session->lns_mode ? "LNS" : "LAC",
- session->debug,
+ 0,
jiffies_to_msecs(session->reorder_timeout));
seq_printf(m, " offset 0 l2specific %hu/%hu\n",
session->l2specific_type, l2tp_get_l2specific_len(session));
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 7ed2b4eced94..6cd97c75445c 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -76,7 +76,7 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev
struct l2tp_eth *priv = netdev_priv(dev);
struct l2tp_session *session = priv->session;
unsigned int len = skb->len;
- int ret = l2tp_xmit_skb(session, skb, session->hdr_len);
+ int ret = l2tp_xmit_skb(session, skb);
if (likely(ret == NET_XMIT_SUCCESS)) {
atomic_long_add(len, &priv->tx_bytes);
@@ -128,17 +128,6 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
struct net_device *dev;
struct l2tp_eth *priv;
- if (session->debug & L2TP_MSG_DATA) {
- unsigned int length;
-
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto error;
-
- pr_debug("%s: eth recv\n", session->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length);
- }
-
if (!pskb_may_pull(skb, ETH_HLEN))
goto error;
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index df2a35b5714a..97ae1255fcb6 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -118,7 +118,6 @@ static int l2tp_ip_recv(struct sk_buff *skb)
struct l2tp_session *session;
struct l2tp_tunnel *tunnel = NULL;
struct iphdr *iph;
- int length;
if (!pskb_may_pull(skb, 4))
goto discard;
@@ -147,20 +146,6 @@ static int l2tp_ip_recv(struct sk_buff *skb)
if (!tunnel)
goto discard_sess;
- /* Trace packet contents, if enabled */
- if (tunnel->debug & L2TP_MSG_DATA) {
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto discard_sess;
-
- /* Point to L2TP header */
- optr = skb->data;
- ptr = skb->data;
- ptr += 4;
- pr_debug("%s: ip recv\n", tunnel->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
- }
-
if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
goto discard_sess;
@@ -248,8 +233,8 @@ static void l2tp_ip_close(struct sock *sk, long timeout)
static void l2tp_ip_destroy_sock(struct sock *sk)
{
+ struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk);
struct sk_buff *skb;
- struct l2tp_tunnel *tunnel = sk->sk_user_data;
while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
kfree_skb(skb);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index bc757bc7e264..96f975777438 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -131,7 +131,6 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
struct l2tp_session *session;
struct l2tp_tunnel *tunnel = NULL;
struct ipv6hdr *iph;
- int length;
if (!pskb_may_pull(skb, 4))
goto discard;
@@ -160,20 +159,6 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
if (!tunnel)
goto discard_sess;
- /* Trace packet contents, if enabled */
- if (tunnel->debug & L2TP_MSG_DATA) {
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto discard_sess;
-
- /* Point to L2TP header */
- optr = skb->data;
- ptr = skb->data;
- ptr += 4;
- pr_debug("%s: ip recv\n", tunnel->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
- }
-
if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
goto discard_sess;
@@ -262,7 +247,7 @@ static void l2tp_ip6_close(struct sock *sk, long timeout)
static void l2tp_ip6_destroy_sock(struct sock *sk)
{
- struct l2tp_tunnel *tunnel = sk->sk_user_data;
+ struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk);
lock_sock(sk);
ip6_flush_pending_frames(sk);
@@ -621,7 +606,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
if (ipc6.tclass < 0)
ipc6.tclass = np->tclass;
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index def78eebca4c..83956c9ee1fc 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -229,14 +229,11 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info
goto out;
}
- if (attrs[L2TP_ATTR_DEBUG])
- cfg.debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]);
-
ret = -EINVAL;
switch (cfg.encap) {
case L2TP_ENCAPTYPE_UDP:
case L2TP_ENCAPTYPE_IP:
- ret = l2tp_tunnel_create(net, fd, proto_version, tunnel_id,
+ ret = l2tp_tunnel_create(fd, proto_version, tunnel_id,
peer_tunnel_id, &cfg, &tunnel);
break;
}
@@ -307,9 +304,6 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info
goto out;
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- tunnel->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
ret = l2tp_tunnel_notify(&l2tp_nl_family, info,
tunnel, L2TP_CMD_TUNNEL_MODIFY);
@@ -400,7 +394,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) ||
nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) ||
- nla_put_u32(skb, L2TP_ATTR_DEBUG, tunnel->debug) ||
+ nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) ||
nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap))
goto nla_put_failure;
@@ -426,6 +420,9 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
atomic_long_read(&tunnel->stats.rx_seq_discards),
L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS,
+ atomic_long_read(&tunnel->stats.rx_cookie_discards),
+ L2TP_ATTR_STATS_PAD) ||
nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS,
atomic_long_read(&tunnel->stats.rx_oos_packets),
L2TP_ATTR_STATS_PAD) ||
@@ -605,9 +602,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
if (info->attrs[L2TP_ATTR_RECV_SEQ])
cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
@@ -689,9 +683,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
goto out;
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
if (info->attrs[L2TP_ATTR_RECV_SEQ])
session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
@@ -730,7 +721,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) ||
nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) ||
nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) ||
- nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
+ nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) ||
nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype))
goto nla_put_failure;
@@ -772,6 +763,9 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
atomic_long_read(&session->stats.rx_seq_discards),
L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS,
+ atomic_long_read(&session->stats.rx_cookie_discards),
+ L2TP_ATTR_STATS_PAD) ||
nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS,
atomic_long_read(&session->stats.rx_oos_packets),
L2TP_ATTR_STATS_PAD) ||
@@ -920,7 +914,7 @@ static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
},
};
-static const struct genl_ops l2tp_nl_ops[] = {
+static const struct genl_small_ops l2tp_nl_ops[] = {
{
.cmd = L2TP_CMD_NOOP,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -987,8 +981,8 @@ static struct genl_family l2tp_nl_family __ro_after_init = {
.policy = l2tp_nl_policy,
.netnsok = true,
.module = THIS_MODULE,
- .ops = l2tp_nl_ops,
- .n_ops = ARRAY_SIZE(l2tp_nl_ops),
+ .small_ops = l2tp_nl_ops,
+ .n_small_ops = ARRAY_SIZE(l2tp_nl_ops),
.mcgrps = l2tp_multicast_group,
.n_mcgrps = ARRAY_SIZE(l2tp_multicast_group),
};
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 13c3153b40d6..aea85f91f059 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -237,17 +237,9 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
if (sk->sk_state & PPPOX_BOUND) {
struct pppox_sock *po;
- l2tp_dbg(session, L2TP_MSG_DATA,
- "%s: recv %d byte data frame, passing to ppp\n",
- session->name, data_len);
-
po = pppox_sk(sk);
ppp_input(&po->chan, skb);
} else {
- l2tp_dbg(session, L2TP_MSG_DATA,
- "%s: recv %d byte data frame, passing to L2TP socket\n",
- session->name, data_len);
-
if (sock_queue_rcv_skb(sk, skb) < 0) {
atomic_long_inc(&session->stats.rx_errors);
kfree_skb(skb);
@@ -259,7 +251,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
no_sock:
rcu_read_unlock();
- l2tp_info(session, L2TP_MSG_DATA, "%s: no socket\n", session->name);
+ pr_warn_ratelimited("%s: no socket in recv\n", session->name);
kfree_skb(skb);
}
@@ -324,7 +316,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
}
local_bh_disable();
- l2tp_xmit_skb(session, skb, session->hdr_len);
+ l2tp_xmit_skb(session, skb);
local_bh_enable();
sock_put(sk);
@@ -383,7 +375,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
skb->data[1] = PPP_UI;
local_bh_disable();
- l2tp_xmit_skb(session, skb, session->hdr_len);
+ l2tp_xmit_skb(session, skb);
local_bh_enable();
sock_put(sk);
@@ -710,7 +702,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
if (!tunnel) {
struct l2tp_tunnel_cfg tcfg = {
.encap = L2TP_ENCAPTYPE_UDP,
- .debug = 0,
};
/* Prevent l2tp_tunnel_register() from trying to set up
@@ -721,7 +712,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
goto end;
}
- error = l2tp_tunnel_create(sock_net(sk), info.fd,
+ error = l2tp_tunnel_create(info.fd,
info.version,
info.tunnel_id,
info.peer_tunnel_id, &tcfg,
@@ -840,8 +831,6 @@ out_no_ppp:
drop_refcnt = false;
sk->sk_state = PPPOX_CONNECTED;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n",
- session->name);
end:
if (error) {
@@ -1076,6 +1065,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session)
return -ENOTCONN;
+ if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ return -EBADF;
+
/* Not defined for tunnels */
if (!session->session_id && !session->peer_session_id)
return -ENOSYS;
@@ -1090,6 +1082,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session)
return -ENOTCONN;
+ if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ return -EBADF;
+
/* Not defined for tunnels */
if (!session->session_id && !session->peer_session_id)
return -ENOSYS;
@@ -1103,6 +1098,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session)
return -ENOTCONN;
+ if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ return -EBADF;
+
/* Session 0 represents the parent tunnel */
if (!session->session_id && !session->peer_session_id) {
u32 session_id;
@@ -1157,9 +1155,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
- tunnel->debug = val;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
- tunnel->name, tunnel->debug);
+ /* Tunnel debug flags option is deprecated */
break;
default:
@@ -1185,9 +1181,6 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
break;
}
session->recv_seq = !!val;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set recv_seq=%d\n",
- session->name, session->recv_seq);
break;
case PPPOL2TP_SO_SENDSEQ:
@@ -1203,9 +1196,6 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
}
l2tp_session_set_header_len(session, session->tunnel->version);
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set send_seq=%d\n",
- session->name, session->send_seq);
break;
case PPPOL2TP_SO_LNSMODE:
@@ -1214,22 +1204,14 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
break;
}
session->lns_mode = !!val;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set lns_mode=%d\n",
- session->name, session->lns_mode);
break;
case PPPOL2TP_SO_DEBUG:
- session->debug = val;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
- session->name, session->debug);
+ /* Session debug flags option is deprecated */
break;
case PPPOL2TP_SO_REORDERTO:
session->reorder_timeout = msecs_to_jiffies(val);
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set reorder_timeout=%d\n",
- session->name, session->reorder_timeout);
break;
default:
@@ -1297,9 +1279,8 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
- *val = tunnel->debug;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get debug=%x\n",
- tunnel->name, tunnel->debug);
+ /* Tunnel debug flags option is deprecated */
+ *val = 0;
break;
default:
@@ -1321,32 +1302,23 @@ static int pppol2tp_session_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_RECVSEQ:
*val = session->recv_seq;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get recv_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_SENDSEQ:
*val = session->send_seq;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get send_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_LNSMODE:
*val = session->lns_mode;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get lns_mode=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_DEBUG:
- *val = session->debug;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get debug=%d\n",
- session->name, *val);
+ /* Session debug flags option is deprecated */
+ *val = 0;
break;
case PPPOL2TP_SO_REORDERTO:
*val = (int)jiffies_to_msecs(session->reorder_timeout);
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get reorder_timeout=%d\n", session->name, *val);
break;
default:
@@ -1534,7 +1506,7 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v)
(tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N',
refcount_read(&tunnel->ref_count) - 1);
seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n",
- tunnel->debug,
+ 0,
atomic_long_read(&tunnel->stats.tx_packets),
atomic_long_read(&tunnel->stats.tx_bytes),
atomic_long_read(&tunnel->stats.tx_errors),
@@ -1580,7 +1552,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
session->lns_mode ? "LNS" : "LAC",
- session->debug,
+ 0,
jiffies_to_msecs(session->reorder_timeout));
seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n",
session->nr, session->ns,
diff --git a/net/l2tp/trace.h b/net/l2tp/trace.h
new file mode 100644
index 000000000000..8596eaa12a2e
--- /dev/null
+++ b/net/l2tp/trace.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM l2tp
+
+#if !defined(_TRACE_L2TP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_L2TP_H
+
+#include <linux/tracepoint.h>
+#include <linux/l2tp.h>
+#include "l2tp_core.h"
+
+#define encap_type_name(e) { L2TP_ENCAPTYPE_##e, #e }
+#define show_encap_type_name(val) \
+ __print_symbolic(val, \
+ encap_type_name(UDP), \
+ encap_type_name(IP))
+
+#define pw_type_name(p) { L2TP_PWTYPE_##p, #p }
+#define show_pw_type_name(val) \
+ __print_symbolic(val, \
+ pw_type_name(ETH_VLAN), \
+ pw_type_name(ETH), \
+ pw_type_name(PPP), \
+ pw_type_name(PPP_AC), \
+ pw_type_name(IP))
+
+DECLARE_EVENT_CLASS(tunnel_only_evt,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_TUNNEL_NAME_MAX)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX);
+ ),
+ TP_printk("%s", __entry->name)
+);
+
+DECLARE_EVENT_CLASS(session_only_evt,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ ),
+ TP_printk("%s", __entry->name)
+);
+
+TRACE_EVENT(register_tunnel,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_TUNNEL_NAME_MAX)
+ __field(int, fd)
+ __field(u32, tid)
+ __field(u32, ptid)
+ __field(int, version)
+ __field(enum l2tp_encap_type, encap)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX);
+ __entry->fd = tunnel->fd;
+ __entry->tid = tunnel->tunnel_id;
+ __entry->ptid = tunnel->peer_tunnel_id;
+ __entry->version = tunnel->version;
+ __entry->encap = tunnel->encap;
+ ),
+ TP_printk("%s: type=%s encap=%s version=L2TPv%d tid=%u ptid=%u fd=%d",
+ __entry->name,
+ __entry->fd > 0 ? "managed" : "unmanaged",
+ show_encap_type_name(__entry->encap),
+ __entry->version,
+ __entry->tid,
+ __entry->ptid,
+ __entry->fd)
+);
+
+DEFINE_EVENT(tunnel_only_evt, delete_tunnel,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel)
+);
+
+DEFINE_EVENT(tunnel_only_evt, free_tunnel,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel)
+);
+
+TRACE_EVENT(register_session,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ __field(u32, tid)
+ __field(u32, ptid)
+ __field(u32, sid)
+ __field(u32, psid)
+ __field(enum l2tp_pwtype, pwtype)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ __entry->tid = session->tunnel ? session->tunnel->tunnel_id : 0;
+ __entry->ptid = session->tunnel ? session->tunnel->peer_tunnel_id : 0;
+ __entry->sid = session->session_id;
+ __entry->psid = session->peer_session_id;
+ __entry->pwtype = session->pwtype;
+ ),
+ TP_printk("%s: pseudowire=%s sid=%u psid=%u tid=%u ptid=%u",
+ __entry->name,
+ show_pw_type_name(__entry->pwtype),
+ __entry->sid,
+ __entry->psid,
+ __entry->tid,
+ __entry->ptid)
+);
+
+DEFINE_EVENT(session_only_evt, delete_session,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_only_evt, free_session,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_only_evt, session_seqnum_lns_enable,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_only_evt, session_seqnum_lns_disable,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DECLARE_EVENT_CLASS(session_seqnum_evt,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ __field(u32, ns)
+ __field(u32, nr)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ __entry->ns = session->ns;
+ __entry->nr = session->nr;
+ ),
+ TP_printk("%s: ns=%u nr=%u",
+ __entry->name,
+ __entry->ns,
+ __entry->nr)
+);
+
+DEFINE_EVENT(session_seqnum_evt, session_seqnum_update,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_seqnum_evt, session_seqnum_reset,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DECLARE_EVENT_CLASS(session_pkt_discard_evt,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ __field(u32, pkt_ns)
+ __field(u32, my_nr)
+ __field(u32, reorder_q_len)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ __entry->pkt_ns = pkt_ns,
+ __entry->my_nr = session->nr;
+ __entry->reorder_q_len = skb_queue_len(&session->reorder_q);
+ ),
+ TP_printk("%s: pkt_ns=%u my_nr=%u reorder_q_len=%u",
+ __entry->name,
+ __entry->pkt_ns,
+ __entry->my_nr,
+ __entry->reorder_q_len)
+);
+
+DEFINE_EVENT(session_pkt_discard_evt, session_pkt_expired,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns)
+);
+
+DEFINE_EVENT(session_pkt_discard_evt, session_pkt_outside_rx_window,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns)
+);
+
+DEFINE_EVENT(session_pkt_discard_evt, session_pkt_oos,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns)
+);
+
+#endif /* _TRACE_L2TP_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
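The new trace.h above only declares the l2tp events. As a rough sketch of the conventional tracepoint pattern (the instantiation site is not shown in the hunks here, so the file name and helper below are assumptions, presumably net/l2tp/l2tp_core.c), exactly one compilation unit defines CREATE_TRACE_POINTS before including the header so the trace_*() helpers generated for each TRACE_EVENT()/DEFINE_EVENT() are emitted once:

/* Sketch only, not part of the patch: assumed placement in l2tp_core.c. */
#define CREATE_TRACE_POINTS
#include "trace.h"

/* Hypothetical helper, for illustration only: each event defined in
 * trace.h yields a trace_<event>() call that compiles to a no-op unless
 * the event is enabled at runtime.
 */
static void l2tp_trace_example(struct l2tp_tunnel *tunnel,
			       struct l2tp_session *session)
{
	trace_register_tunnel(tunnel);        /* from TRACE_EVENT(register_tunnel, ...) */
	trace_session_seqnum_update(session); /* from DEFINE_EVENT(session_seqnum_evt, ...) */
}

Because TRACE_SYSTEM is set to l2tp, the events are grouped under an "l2tp" event directory in tracefs once the module is built; other l2tp source files can simply include "trace.h" to call the same helpers.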
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 864326f150e2..ad7730b68772 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -241,6 +241,7 @@ EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup);
* L3 master device
* @net: network namespace for device index lookup
* @fl: flow struct
+ * @arg: store the table the rule matched with here
*/
int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
index 3c03f6512c5f..0511bbe4af7b 100644
--- a/net/lapb/lapb_iface.c
+++ b/net/lapb/lapb_iface.c
@@ -122,6 +122,8 @@ static struct lapb_cb *lapb_create_cb(void)
timer_setup(&lapb->t1timer, NULL, 0);
timer_setup(&lapb->t2timer, NULL, 0);
+ lapb->t1timer_stop = true;
+ lapb->t2timer_stop = true;
lapb->t1 = LAPB_DEFAULT_T1;
lapb->t2 = LAPB_DEFAULT_T2;
@@ -129,6 +131,8 @@ static struct lapb_cb *lapb_create_cb(void)
lapb->mode = LAPB_DEFAULT_MODE;
lapb->window = LAPB_DEFAULT_WINDOW;
lapb->state = LAPB_STATE_0;
+
+ spin_lock_init(&lapb->lock);
refcount_set(&lapb->refcnt, 1);
out:
return lapb;
@@ -178,11 +182,23 @@ int lapb_unregister(struct net_device *dev)
goto out;
lapb_put(lapb);
+ /* Wait for other refs to "lapb" to drop */
+ while (refcount_read(&lapb->refcnt) > 2)
+ usleep_range(1, 10);
+
+ spin_lock_bh(&lapb->lock);
+
lapb_stop_t1timer(lapb);
lapb_stop_t2timer(lapb);
lapb_clear_queues(lapb);
+ spin_unlock_bh(&lapb->lock);
+
+ /* Wait for running timers to stop */
+ del_timer_sync(&lapb->t1timer);
+ del_timer_sync(&lapb->t2timer);
+
__lapb_remove_cb(lapb);
lapb_put(lapb);
@@ -201,6 +217,8 @@ int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms)
if (!lapb)
goto out;
+ spin_lock_bh(&lapb->lock);
+
parms->t1 = lapb->t1 / HZ;
parms->t2 = lapb->t2 / HZ;
parms->n2 = lapb->n2;
@@ -219,6 +237,7 @@ int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms)
else
parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ;
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
rc = LAPB_OK;
out:
@@ -234,6 +253,8 @@ int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms)
if (!lapb)
goto out;
+ spin_lock_bh(&lapb->lock);
+
rc = LAPB_INVALUE;
if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1)
goto out_put;
@@ -256,6 +277,7 @@ int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms)
rc = LAPB_OK;
out_put:
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
out:
return rc;
@@ -270,6 +292,8 @@ int lapb_connect_request(struct net_device *dev)
if (!lapb)
goto out;
+ spin_lock_bh(&lapb->lock);
+
rc = LAPB_OK;
if (lapb->state == LAPB_STATE_1)
goto out_put;
@@ -285,24 +309,18 @@ int lapb_connect_request(struct net_device *dev)
rc = LAPB_OK;
out_put:
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
out:
return rc;
}
EXPORT_SYMBOL(lapb_connect_request);
-int lapb_disconnect_request(struct net_device *dev)
+static int __lapb_disconnect_request(struct lapb_cb *lapb)
{
- struct lapb_cb *lapb = lapb_devtostruct(dev);
- int rc = LAPB_BADTOKEN;
-
- if (!lapb)
- goto out;
-
switch (lapb->state) {
case LAPB_STATE_0:
- rc = LAPB_NOTCONNECTED;
- goto out_put;
+ return LAPB_NOTCONNECTED;
case LAPB_STATE_1:
lapb_dbg(1, "(%p) S1 TX DISC(1)\n", lapb->dev);
@@ -310,12 +328,10 @@ int lapb_disconnect_request(struct net_device *dev)
lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
lapb->state = LAPB_STATE_0;
lapb_start_t1timer(lapb);
- rc = LAPB_NOTCONNECTED;
- goto out_put;
+ return LAPB_NOTCONNECTED;
case LAPB_STATE_2:
- rc = LAPB_OK;
- goto out_put;
+ return LAPB_OK;
}
lapb_clear_queues(lapb);
@@ -328,8 +344,22 @@ int lapb_disconnect_request(struct net_device *dev)
lapb_dbg(1, "(%p) S3 DISC(1)\n", lapb->dev);
lapb_dbg(0, "(%p) S3 -> S2\n", lapb->dev);
- rc = LAPB_OK;
-out_put:
+ return LAPB_OK;
+}
+
+int lapb_disconnect_request(struct net_device *dev)
+{
+ struct lapb_cb *lapb = lapb_devtostruct(dev);
+ int rc = LAPB_BADTOKEN;
+
+ if (!lapb)
+ goto out;
+
+ spin_lock_bh(&lapb->lock);
+
+ rc = __lapb_disconnect_request(lapb);
+
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
out:
return rc;
@@ -344,6 +374,8 @@ int lapb_data_request(struct net_device *dev, struct sk_buff *skb)
if (!lapb)
goto out;
+ spin_lock_bh(&lapb->lock);
+
rc = LAPB_NOTCONNECTED;
if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4)
goto out_put;
@@ -352,6 +384,7 @@ int lapb_data_request(struct net_device *dev, struct sk_buff *skb)
lapb_kick(lapb);
rc = LAPB_OK;
out_put:
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
out:
return rc;
@@ -364,7 +397,9 @@ int lapb_data_received(struct net_device *dev, struct sk_buff *skb)
int rc = LAPB_BADTOKEN;
if (lapb) {
+ spin_lock_bh(&lapb->lock);
lapb_data_input(lapb, skb);
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
rc = LAPB_OK;
}
@@ -418,14 +453,98 @@ int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb)
return used;
}
+/* Handle device status changes. */
+static int lapb_device_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct lapb_cb *lapb;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (dev->type != ARPHRD_X25)
+ return NOTIFY_DONE;
+
+ lapb = lapb_devtostruct(dev);
+ if (!lapb)
+ return NOTIFY_DONE;
+
+ spin_lock_bh(&lapb->lock);
+
+ switch (event) {
+ case NETDEV_UP:
+ lapb_dbg(0, "(%p) Interface up: %s\n", dev, dev->name);
+
+ if (netif_carrier_ok(dev)) {
+ lapb_dbg(0, "(%p): Carrier is already up: %s\n", dev,
+ dev->name);
+ if (lapb->mode & LAPB_DCE) {
+ lapb_start_t1timer(lapb);
+ } else {
+ if (lapb->state == LAPB_STATE_0) {
+ lapb->state = LAPB_STATE_1;
+ lapb_establish_data_link(lapb);
+ }
+ }
+ }
+ break;
+ case NETDEV_GOING_DOWN:
+ if (netif_carrier_ok(dev))
+ __lapb_disconnect_request(lapb);
+ break;
+ case NETDEV_DOWN:
+ lapb_dbg(0, "(%p) Interface down: %s\n", dev, dev->name);
+ lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state);
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb->n2count = 0;
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ break;
+ case NETDEV_CHANGE:
+ if (netif_carrier_ok(dev)) {
+ lapb_dbg(0, "(%p): Carrier detected: %s\n", dev,
+ dev->name);
+ if (lapb->mode & LAPB_DCE) {
+ lapb_start_t1timer(lapb);
+ } else {
+ if (lapb->state == LAPB_STATE_0) {
+ lapb->state = LAPB_STATE_1;
+ lapb_establish_data_link(lapb);
+ }
+ }
+ } else {
+ lapb_dbg(0, "(%p) Carrier lost: %s\n", dev, dev->name);
+ lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state);
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb->n2count = 0;
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ }
+ break;
+ }
+
+ spin_unlock_bh(&lapb->lock);
+ lapb_put(lapb);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lapb_dev_notifier = {
+ .notifier_call = lapb_device_event,
+};
+
static int __init lapb_init(void)
{
- return 0;
+ return register_netdevice_notifier(&lapb_dev_notifier);
}
static void __exit lapb_exit(void)
{
WARN_ON(!list_empty(&lapb_list));
+
+ unregister_netdevice_notifier(&lapb_dev_notifier);
}
MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>");
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
index 7a4d0715d1c3..a966d29c772d 100644
--- a/net/lapb/lapb_out.c
+++ b/net/lapb/lapb_out.c
@@ -82,7 +82,8 @@ void lapb_kick(struct lapb_cb *lapb)
skb = skb_dequeue(&lapb->write_queue);
do {
- if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+ skbn = skb_copy(skb, GFP_ATOMIC);
+ if (!skbn) {
skb_queue_head(&lapb->write_queue, skb);
break;
}
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
index 8f5b17001a07..0230b272b7d1 100644
--- a/net/lapb/lapb_timer.c
+++ b/net/lapb/lapb_timer.c
@@ -40,6 +40,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb)
lapb->t1timer.function = lapb_t1timer_expiry;
lapb->t1timer.expires = jiffies + lapb->t1;
+ lapb->t1timer_stop = false;
add_timer(&lapb->t1timer);
}
@@ -50,16 +51,19 @@ void lapb_start_t2timer(struct lapb_cb *lapb)
lapb->t2timer.function = lapb_t2timer_expiry;
lapb->t2timer.expires = jiffies + lapb->t2;
+ lapb->t2timer_stop = false;
add_timer(&lapb->t2timer);
}
void lapb_stop_t1timer(struct lapb_cb *lapb)
{
+ lapb->t1timer_stop = true;
del_timer(&lapb->t1timer);
}
void lapb_stop_t2timer(struct lapb_cb *lapb)
{
+ lapb->t2timer_stop = true;
del_timer(&lapb->t2timer);
}
@@ -72,24 +76,46 @@ static void lapb_t2timer_expiry(struct timer_list *t)
{
struct lapb_cb *lapb = from_timer(lapb, t, t2timer);
+ spin_lock_bh(&lapb->lock);
+ if (timer_pending(&lapb->t2timer)) /* A new timer has been set up */
+ goto out;
+ if (lapb->t2timer_stop) /* The timer has been stopped */
+ goto out;
+
if (lapb->condition & LAPB_ACK_PENDING_CONDITION) {
lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
lapb_timeout_response(lapb);
}
+
+out:
+ spin_unlock_bh(&lapb->lock);
}
static void lapb_t1timer_expiry(struct timer_list *t)
{
struct lapb_cb *lapb = from_timer(lapb, t, t1timer);
+ spin_lock_bh(&lapb->lock);
+ if (timer_pending(&lapb->t1timer)) /* A new timer has been set up */
+ goto out;
+ if (lapb->t1timer_stop) /* The timer has been stopped */
+ goto out;
+
switch (lapb->state) {
/*
- * If we are a DCE, keep going DM .. DM .. DM
+ * If we are a DCE, send DM up to N2 times, then switch to
+ * STATE_1 and send SABM(E).
*/
case LAPB_STATE_0:
- if (lapb->mode & LAPB_DCE)
+ if (lapb->mode & LAPB_DCE &&
+ lapb->n2count != lapb->n2) {
+ lapb->n2count++;
lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE);
+ } else {
+ lapb->state = LAPB_STATE_1;
+ lapb_establish_data_link(lapb);
+ }
break;
/*
@@ -101,7 +127,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb->state = LAPB_STATE_0;
lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev);
- return;
+ goto out;
} else {
lapb->n2count++;
if (lapb->mode & LAPB_EXTENDED) {
@@ -125,7 +151,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb->state = LAPB_STATE_0;
lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev);
- return;
+ goto out;
} else {
lapb->n2count++;
lapb_dbg(1, "(%p) S2 TX DISC(1)\n", lapb->dev);
@@ -143,7 +169,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb_stop_t2timer(lapb);
lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev);
- return;
+ goto out;
} else {
lapb->n2count++;
lapb_requeue_frames(lapb);
@@ -160,7 +186,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb->state = LAPB_STATE_0;
lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S4 -> S0\n", lapb->dev);
- return;
+ goto out;
} else {
lapb->n2count++;
lapb_transmit_frmr(lapb);
@@ -169,4 +195,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
}
lapb_start_t1timer(lapb);
+
+out:
+ spin_unlock_bh(&lapb->lock);
}
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 1144cda2a0fc..912aa9bd5e29 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -909,6 +909,8 @@ static void llc_sk_init(struct sock *sk)
* @net: network namespace
* @family: upper layer protocol family
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @prot: struct proto associated with this new sock instance
+ * @kern: is this to be a kernel socket?
*
* Allocates a LLC sock and initializes it. Returns the new LLC sock
* or %NULL if there's no memory available for one
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index cd9a9bd242ba..51ec8256b7fa 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -69,7 +69,7 @@ config MAC80211_MESH
config MAC80211_LEDS
bool "Enable LED triggers"
depends on MAC80211
- depends on LEDS_CLASS
+ depends on LEDS_CLASS=y || LEDS_CLASS=MAC80211
select LEDS_TRIGGERS
help
This option enables a few LED triggers for different
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 6cbb1286d6c0..ad04c361cba5 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -13,6 +13,7 @@ mac80211-y := \
ht.o agg-tx.o agg-rx.o \
vht.o \
he.o \
+ s1g.o \
ibss.o \
iface.o \
rate.o \
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 313ba97acae3..cce28e3b2232 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -250,10 +250,10 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid,
mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP;
mgmt->u.action.u.addba_resp.dialog_token = dialog_token;
- capab = (u16)(amsdu << 0); /* bit 0 A-MSDU support */
- capab |= (u16)(policy << 1); /* bit 1 aggregation policy */
- capab |= (u16)(tid << 2); /* bit 5:2 TID number */
- capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */
+ capab = u16_encode_bits(amsdu, IEEE80211_ADDBA_PARAM_AMSDU_MASK);
+ capab |= u16_encode_bits(policy, IEEE80211_ADDBA_PARAM_POLICY_MASK);
+ capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK);
+ capab |= u16_encode_bits(buf_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK);
mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab);
mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
@@ -350,7 +350,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
sta->sta.addr, tid);
/* We have no API to update the timeout value in the
* driver so reject the timeout update if the timeout
- * changed. If if did not change, i.e., no real update,
+ * changed. If it did not change, i.e., no real update,
* just reply with success.
*/
rcu_read_lock();
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index b37c8a983d88..430a58587538 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -95,10 +95,10 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ;
mgmt->u.action.u.addba_req.dialog_token = dialog_token;
- capab = (u16)(1 << 0); /* bit 0 A-MSDU support */
- capab |= (u16)(1 << 1); /* bit 1 aggregation policy */
- capab |= (u16)(tid << 2); /* bit 5:2 TID number */
- capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggergation */
+ capab = IEEE80211_ADDBA_PARAM_AMSDU_MASK;
+ capab |= IEEE80211_ADDBA_PARAM_POLICY_MASK;
+ capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK);
+ capab |= u16_encode_bits(agg_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK);
mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab);
@@ -950,8 +950,8 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK;
- tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
- buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+ tid = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_TID_MASK);
+ buf_size = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK);
buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
txq = sta->sta.txq[tid];
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 87fddd84c621..c4c70e30ad7f 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -405,6 +405,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
case WLAN_CIPHER_SUITE_WEP104:
if (WARN_ON_ONCE(fips_enabled))
return -EINVAL;
+ break;
case WLAN_CIPHER_SUITE_CCMP:
case WLAN_CIPHER_SUITE_CCMP_256:
case WLAN_CIPHER_SUITE_AES_CMAC:
@@ -709,7 +710,8 @@ void sta_set_rate_info_tx(struct sta_info *sta,
u16 brate;
sband = ieee80211_get_sband(sta->sdata);
- if (sband) {
+ WARN_ON_ONCE(sband && !sband->bitrates);
+ if (sband && sband->bitrates) {
brate = sband->bitrates[rate->idx].bitrate;
rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
}
@@ -826,9 +828,9 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
memcpy(new->data, resp, resp_len);
if (csa)
- memcpy(new->csa_counter_offsets, csa->counter_offsets_presp,
+ memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp,
csa->n_counter_offsets_presp *
- sizeof(new->csa_counter_offsets[0]));
+ sizeof(new->cntdwn_counter_offsets[0]));
rcu_assign_pointer(sdata->u.ap.probe_resp, new);
if (old)
@@ -837,6 +839,59 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
return 0;
}
+static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_fils_discovery *params)
+{
+ struct fils_discovery_data *new, *old = NULL;
+ struct ieee80211_fils_discovery *fd;
+
+ if (!params->tmpl || !params->tmpl_len)
+ return -EINVAL;
+
+ fd = &sdata->vif.bss_conf.fils_discovery;
+ fd->min_interval = params->min_interval;
+ fd->max_interval = params->max_interval;
+
+ old = sdata_dereference(sdata->u.ap.fils_discovery, sdata);
+ new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+ new->len = params->tmpl_len;
+ memcpy(new->data, params->tmpl, params->tmpl_len);
+ rcu_assign_pointer(sdata->u.ap.fils_discovery, new);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+
+ return 0;
+}
+
+static int
+ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_unsol_bcast_probe_resp *params)
+{
+ struct unsol_bcast_probe_resp_data *new, *old = NULL;
+
+ if (!params->tmpl || !params->tmpl_len)
+ return -EINVAL;
+
+ old = sdata_dereference(sdata->u.ap.unsol_bcast_probe_resp, sdata);
+ new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+ new->len = params->tmpl_len;
+ memcpy(new->data, params->tmpl, params->tmpl_len);
+ rcu_assign_pointer(sdata->u.ap.unsol_bcast_probe_resp, new);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+
+ sdata->vif.bss_conf.unsol_bcast_probe_resp_interval =
+ params->interval;
+
+ return 0;
+}
+
static int ieee80211_set_ftm_responder_params(
struct ieee80211_sub_if_data *sdata,
const u8 *lci, size_t lci_len,
@@ -926,10 +981,10 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
new->tail_len = new_tail_len;
if (csa) {
- new->csa_current_counter = csa->count;
- memcpy(new->csa_counter_offsets, csa->counter_offsets_beacon,
+ new->cntdwn_current_counter = csa->count;
+ memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon,
csa->n_counter_offsets_beacon *
- sizeof(new->csa_counter_offsets[0]));
+ sizeof(new->cntdwn_counter_offsets[0]));
}
/* copy in head */
@@ -1067,10 +1122,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
sdata->vif.bss_conf.enable_beacon = true;
sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
sdata->vif.bss_conf.twt_responder = params->twt_responder;
- memcpy(&sdata->vif.bss_conf.he_obss_pd, &params->he_obss_pd,
- sizeof(struct ieee80211_he_obss_pd));
- memcpy(&sdata->vif.bss_conf.he_bss_color, &params->he_bss_color,
- sizeof(struct ieee80211_he_bss_color));
+ sdata->vif.bss_conf.he_obss_pd = params->he_obss_pd;
+ sdata->vif.bss_conf.he_bss_color = params->he_bss_color;
+ sdata->vif.bss_conf.s1g = params->chandef.chan->band ==
+ NL80211_BAND_S1GHZ;
sdata->vif.bss_conf.ssid_len = params->ssid_len;
if (params->ssid_len)
@@ -1098,13 +1153,30 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
}
}
+ if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
+ sdata->vif.bss_conf.beacon_tx_rate = params->beacon_rate;
+
err = ieee80211_assign_beacon(sdata, &params->beacon, NULL);
- if (err < 0) {
- ieee80211_vif_release_channel(sdata);
- return err;
- }
+ if (err < 0)
+ goto error;
changed |= err;
+ if (params->fils_discovery.max_interval) {
+ err = ieee80211_set_fils_discovery(sdata,
+ &params->fils_discovery);
+ if (err < 0)
+ goto error;
+ changed |= BSS_CHANGED_FILS_DISCOVERY;
+ }
+
+ if (params->unsol_bcast_probe_resp.interval) {
+ err = ieee80211_set_unsol_bcast_probe_resp(sdata,
+ &params->unsol_bcast_probe_resp);
+ if (err < 0)
+ goto error;
+ changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP;
+ }
+
err = drv_start_ap(sdata->local, sdata);
if (err) {
old = sdata_dereference(sdata->u.ap.beacon, sdata);
@@ -1112,8 +1184,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
if (old)
kfree_rcu(old, rcu_head);
RCU_INIT_POINTER(sdata->u.ap.beacon, NULL);
- ieee80211_vif_release_channel(sdata);
- return err;
+ goto error;
}
ieee80211_recalc_dtim(local, sdata);
@@ -1124,6 +1195,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
netif_carrier_on(vlan->dev);
return 0;
+
+error:
+ ieee80211_vif_release_channel(sdata);
+ return err;
}
static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
@@ -1160,6 +1235,8 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
struct ieee80211_local *local = sdata->local;
struct beacon_data *old_beacon;
struct probe_resp *old_probe_resp;
+ struct fils_discovery_data *old_fils_discovery;
+ struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp;
struct cfg80211_chan_def chandef;
sdata_assert_lock(sdata);
@@ -1168,6 +1245,11 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
if (!old_beacon)
return -ENOENT;
old_probe_resp = sdata_dereference(sdata->u.ap.probe_resp, sdata);
+ old_fils_discovery = sdata_dereference(sdata->u.ap.fils_discovery,
+ sdata);
+ old_unsol_bcast_probe_resp =
+ sdata_dereference(sdata->u.ap.unsol_bcast_probe_resp,
+ sdata);
/* abort any running channel switch */
mutex_lock(&local->mtx);
@@ -1191,9 +1273,15 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
/* remove beacon and probe response */
RCU_INIT_POINTER(sdata->u.ap.beacon, NULL);
RCU_INIT_POINTER(sdata->u.ap.probe_resp, NULL);
+ RCU_INIT_POINTER(sdata->u.ap.fils_discovery, NULL);
+ RCU_INIT_POINTER(sdata->u.ap.unsol_bcast_probe_resp, NULL);
kfree_rcu(old_beacon, rcu_head);
if (old_probe_resp)
kfree_rcu(old_probe_resp, rcu_head);
+ if (old_fils_discovery)
+ kfree_rcu(old_fils_discovery, rcu_head);
+ if (old_unsol_bcast_probe_resp)
+ kfree_rcu(old_unsol_bcast_probe_resp, rcu_head);
kfree(sdata->vif.bss_conf.ftmr_params);
sdata->vif.bss_conf.ftmr_params = NULL;
@@ -1696,6 +1784,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
__ieee80211_check_fast_rx_iface(vlansdata);
+ drv_sta_set_4addr(local, sta->sdata, &sta->sta, true);
}
if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
@@ -2618,16 +2707,6 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy,
return 0;
}
-static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev,
- const u8 *addr)
-{
- struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
- memcpy(&sdata->u.wds.remote_addr, addr, ETH_ALEN);
-
- return 0;
-}
-
static void ieee80211_rfkill_poll(struct wiphy *wiphy)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
@@ -3186,9 +3265,9 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
break;
if ((params->n_counter_offsets_beacon >
- IEEE80211_MAX_CSA_COUNTERS_NUM) ||
+ IEEE80211_MAX_CNTDWN_COUNTERS_NUM) ||
(params->n_counter_offsets_presp >
- IEEE80211_MAX_CSA_COUNTERS_NUM))
+ IEEE80211_MAX_CNTDWN_COUNTERS_NUM))
return -EINVAL;
csa.counter_offsets_beacon = params->counter_offsets_beacon;
@@ -3217,6 +3296,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
if (cfg80211_get_chandef_type(&params->chandef) !=
cfg80211_get_chandef_type(&sdata->u.ibss.chandef))
return -EINVAL;
+ break;
case NL80211_CHAN_WIDTH_5:
case NL80211_CHAN_WIDTH_10:
case NL80211_CHAN_WIDTH_20_NOHT:
@@ -3368,7 +3448,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
IEEE80211_QUEUE_STOP_REASON_CSA);
cfg80211_ch_switch_started_notify(sdata->dev, &sdata->csa_chandef,
- params->count);
+ params->count, params->block_tx);
if (changed) {
ieee80211_bss_info_change_notify(sdata, changed);
@@ -3993,6 +4073,17 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy,
return ret;
}
+static int ieee80211_set_sar_specs(struct wiphy *wiphy,
+ struct cfg80211_sar_specs *sar)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ if (!local->ops->set_sar_specs)
+ return -EOPNOTSUPP;
+
+ return local->ops->set_sar_specs(&local->hw, sar);
+}
+
const struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -4048,7 +4139,6 @@ const struct cfg80211_ops mac80211_config_ops = {
.set_wiphy_params = ieee80211_set_wiphy_params,
.set_tx_power = ieee80211_set_tx_power,
.get_tx_power = ieee80211_get_tx_power,
- .set_wds_peer = ieee80211_set_wds_peer,
.rfkill_poll = ieee80211_rfkill_poll,
CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd)
CFG80211_TESTMODE_DUMP(ieee80211_testmode_dump)
@@ -4096,4 +4186,5 @@ const struct cfg80211_ops mac80211_config_ops = {
.probe_mesh_link = ieee80211_probe_mesh_link,
.set_tid_config = ieee80211_set_tid_config,
.reset_tid_config = ieee80211_reset_tid_config,
+ .set_sar_specs = ieee80211_set_sar_specs,
};
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index bdc0f29dc6cd..907bb1f748a1 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -9,6 +9,7 @@
#include <net/cfg80211.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
+#include "rate.h"
static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx)
@@ -191,11 +192,13 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
return NULL;
}
-enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta)
+static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta)
{
- switch (sta->bandwidth) {
+ enum ieee80211_sta_rx_bandwidth width = ieee80211_sta_cap_rx_bw(sta);
+
+ switch (width) {
case IEEE80211_STA_RX_BW_20:
- if (sta->ht_cap.ht_supported)
+ if (sta->sta.ht_cap.ht_supported)
return NL80211_CHAN_WIDTH_20;
else
return NL80211_CHAN_WIDTH_20_NOHT;
@@ -232,7 +235,7 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
!(sta->sdata->bss && sta->sdata->bss == sdata->bss))
continue;
- max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta));
+ max_bw = max(max_bw, ieee80211_get_sta_bw(sta));
}
rcu_read_unlock();
@@ -275,11 +278,11 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
case NL80211_IFTYPE_NAN:
continue;
case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_MESH_POINT:
case NL80211_IFTYPE_OCB:
width = vif->bss_conf.chandef.width;
break;
+ case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_UNSPECIFIED:
case NUM_NL80211_IFTYPES:
case NL80211_IFTYPE_MONITOR:
@@ -343,10 +346,42 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_MIN_WIDTH);
}
+static void ieee80211_chan_bw_change(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ struct sta_info *sta;
+ struct ieee80211_supported_band *sband =
+ local->hw.wiphy->bands[ctx->conf.def.chan->band];
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &local->sta_list,
+ list) {
+ enum ieee80211_sta_rx_bandwidth new_sta_bw;
+
+ if (!ieee80211_sdata_running(sta->sdata))
+ continue;
+
+ if (rcu_access_pointer(sta->sdata->vif.chanctx_conf) !=
+ &ctx->conf)
+ continue;
+
+ new_sta_bw = ieee80211_sta_cur_vht_bw(sta);
+ if (new_sta_bw == sta->sta.bandwidth)
+ continue;
+
+ sta->sta.bandwidth = new_sta_bw;
+ rate_control_rate_update(local, sband, sta,
+ IEEE80211_RC_BW_CHANGED);
+ }
+ rcu_read_unlock();
+}
+
static void ieee80211_change_chanctx(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx,
const struct cfg80211_chan_def *chandef)
{
+ enum nl80211_chan_width width;
+
if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) {
ieee80211_recalc_chanctx_min_def(local, ctx);
return;
@@ -354,7 +389,25 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local,
WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef));
+ width = ctx->conf.def.width;
ctx->conf.def = *chandef;
+
+ /* expected to handle only 20/40/80/160 channel widths */
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_40:
+ case NL80211_CHAN_WIDTH_80:
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_160:
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ if (chandef->width < width)
+ ieee80211_chan_bw_change(local, ctx);
+
drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_WIDTH);
ieee80211_recalc_chanctx_min_def(local, ctx);
@@ -362,6 +415,9 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local,
local->_oper_chandef = *chandef;
ieee80211_hw_config(local, 0);
}
+
+ if (chandef->width > width)
+ ieee80211_chan_bw_change(local, ctx);
}
static struct ieee80211_chanctx *
@@ -536,7 +592,14 @@ static void ieee80211_del_chanctx(struct ieee80211_local *local,
if (!local->use_chanctx) {
struct cfg80211_chan_def *chandef = &local->_oper_chandef;
- chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
+ /* S1G doesn't have 20MHz, so get the correct width for the
+ * current channel.
+ */
+ if (chandef->chan->band == NL80211_BAND_S1GHZ)
+ chandef->width =
+ ieee80211_s1g_channel_width(chandef->chan);
+ else
+ chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
chandef->center_freq1 = chandef->chan->center_freq;
chandef->freq1_offset = chandef->chan->freq_offset;
chandef->center_freq2 = 0;
@@ -736,7 +799,6 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
continue;
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_MESH_POINT:
case NL80211_IFTYPE_OCB:
break;
@@ -1045,8 +1107,14 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
if (WARN_ON(!chandef))
return -EINVAL;
+ if (old_ctx->conf.def.width > new_ctx->conf.def.width)
+ ieee80211_chan_bw_change(local, new_ctx);
+
ieee80211_change_chanctx(local, new_ctx, chandef);
+ if (old_ctx->conf.def.width < new_ctx->conf.def.width)
+ ieee80211_chan_bw_change(local, new_ctx);
+
vif_chsw[0].vif = &sdata->vif;
vif_chsw[0].old_ctx = &old_ctx->conf;
vif_chsw[0].new_ctx = &new_ctx->conf;
@@ -1437,6 +1505,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
ieee80211_recalc_smps_chanctx(local, ctx);
ieee80211_recalc_radar_chanctx(local, ctx);
ieee80211_recalc_chanctx_min_def(local, ctx);
+ ieee80211_chan_bw_change(local, ctx);
list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
reserved_chanctx_list) {
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 54080290d6e2..9e723d943421 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -53,7 +53,7 @@ static const struct file_operations name## _ops = { \
DEBUGFS_READONLY_FILE_OPS(name)
#define DEBUGFS_ADD(name) \
- debugfs_create_file(#name, 0400, phyd, local, &name## _ops);
+ debugfs_create_file(#name, 0400, phyd, local, &name## _ops)
#define DEBUGFS_ADD_MODE(name, mode) \
debugfs_create_file(#name, mode, phyd, local, &name## _ops);
@@ -120,18 +120,17 @@ static ssize_t aqm_write(struct file *file,
{
struct ieee80211_local *local = file->private_data;
char buf[100];
- size_t len;
- if (count > sizeof(buf))
+ if (count >= sizeof(buf))
return -EINVAL;
if (copy_from_user(buf, user_buf, count))
return -EFAULT;
- buf[sizeof(buf) - 1] = '\0';
- len = strlen(buf);
- if (len > 0 && buf[len-1] == '\n')
- buf[len-1] = 0;
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1)
return count;
@@ -177,18 +176,17 @@ static ssize_t airtime_flags_write(struct file *file,
{
struct ieee80211_local *local = file->private_data;
char buf[16];
- size_t len;
- if (count > sizeof(buf))
+ if (count >= sizeof(buf))
return -EINVAL;
if (copy_from_user(buf, user_buf, count))
return -EFAULT;
- buf[sizeof(buf) - 1] = 0;
- len = strlen(buf);
- if (len > 0 && buf[len - 1] == '\n')
- buf[len - 1] = 0;
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
if (kstrtou16(buf, 0, &local->airtime_flags))
return -EINVAL;
@@ -237,20 +235,19 @@ static ssize_t aql_txq_limit_write(struct file *file,
{
struct ieee80211_local *local = file->private_data;
char buf[100];
- size_t len;
u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old;
struct sta_info *sta;
- if (count > sizeof(buf))
+ if (count >= sizeof(buf))
return -EINVAL;
if (copy_from_user(buf, user_buf, count))
return -EFAULT;
- buf[sizeof(buf) - 1] = 0;
- len = strlen(buf);
- if (len > 0 && buf[len - 1] == '\n')
- buf[len - 1] = 0;
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3)
return -EINVAL;
@@ -306,18 +303,17 @@ static ssize_t force_tx_status_write(struct file *file,
{
struct ieee80211_local *local = file->private_data;
char buf[3];
- size_t len;
- if (count > sizeof(buf))
+ if (count >= sizeof(buf))
return -EINVAL;
if (copy_from_user(buf, user_buf, count))
return -EFAULT;
- buf[sizeof(buf) - 1] = '\0';
- len = strlen(buf);
- if (len > 0 && buf[len - 1] == '\n')
- buf[len - 1] = 0;
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
if (buf[0] == '0' && buf[1] == '\0')
local->force_tx_status = 0;
@@ -408,6 +404,7 @@ static const char *hw_flag_names[] = {
FLAG(SUPPORTS_MULTI_BSSID),
FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID),
FLAG(AMPDU_KEYBORDER_SUPPORT),
+ FLAG(SUPPORTS_TX_ENCAP_OFFLOAD),
#undef FLAG
};
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index 98a713475e0f..f53dec8a3d5c 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -319,7 +319,7 @@ KEY_OPS(key);
#define DEBUGFS_ADD(name) \
debugfs_create_file(#name, 0400, key->debugfs.dir, \
- key, &key_##name##_ops);
+ key, &key_##name##_ops)
#define DEBUGFS_ADD_W(name) \
debugfs_create_file(#name, 0600, key->debugfs.dir, \
key, &key_##name##_ops);
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index fe8a7a87e513..0ad3860852ff 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -574,9 +574,6 @@ static ssize_t ieee80211_if_parse_tsf(
IEEE80211_IF_FILE_RW(tsf);
-/* WDS attributes */
-IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC);
-
#ifdef CONFIG_MAC80211_MESH
IEEE80211_IF_FILE(estab_plinks, u.mesh.estab_plinks, ATOMIC);
@@ -645,7 +642,7 @@ IEEE80211_IF_FILE(dot11MeshConnectedToAuthServer,
#define DEBUGFS_ADD_MODE(name, mode) \
debugfs_create_file(#name, mode, sdata->vif.debugfs_dir, \
- sdata, &name##_ops);
+ sdata, &name##_ops)
#define DEBUGFS_ADD(name) DEBUGFS_ADD_MODE(name, 0400)
@@ -701,11 +698,6 @@ static void add_ibss_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD_MODE(tsf, 0600);
}
-static void add_wds_files(struct ieee80211_sub_if_data *sdata)
-{
- DEBUGFS_ADD(peer);
-}
-
#ifdef CONFIG_MAC80211_MESH
static void add_mesh_files(struct ieee80211_sub_if_data *sdata)
@@ -719,7 +711,7 @@ static void add_mesh_stats(struct ieee80211_sub_if_data *sdata)
struct dentry *dir = debugfs_create_dir("mesh_stats",
sdata->vif.debugfs_dir);
#define MESHSTATS_ADD(name)\
- debugfs_create_file(#name, 0400, dir, sdata, &name##_ops);
+ debugfs_create_file(#name, 0400, dir, sdata, &name##_ops)
MESHSTATS_ADD(fwded_mcast);
MESHSTATS_ADD(fwded_unicast);
@@ -736,7 +728,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata)
sdata->vif.debugfs_dir);
#define MESHPARAMS_ADD(name) \
- debugfs_create_file(#name, 0600, dir, sdata, &name##_ops);
+ debugfs_create_file(#name, 0600, dir, sdata, &name##_ops)
MESHPARAMS_ADD(dot11MeshMaxRetries);
MESHPARAMS_ADD(dot11MeshRetryTimeout);
@@ -805,9 +797,6 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
case NL80211_IFTYPE_AP_VLAN:
add_vlan_files(sdata);
break;
- case NL80211_IFTYPE_WDS:
- add_wds_files(sdata);
- break;
default:
break;
}
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 829dcad69c2c..eb4bb79d936a 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -274,7 +274,7 @@ static ssize_t sta_aql_read(struct file *file, char __user *userbuf,
"Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n",
q_depth[0], q_depth[1], q_depth[2], q_depth[3],
q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1],
- q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]),
+ q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]);
rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
kfree(buf);
@@ -985,7 +985,7 @@ STA_OPS(he_capa);
#define DEBUGFS_ADD(name) \
debugfs_create_file(#name, 0400, \
- sta->debugfs_dir, sta, &sta_ ##name## _ops);
+ sta->debugfs_dir, sta, &sta_ ##name## _ops)
#define DEBUGFS_ADD_COUNTER(name, field) \
debugfs_create_ulong(#name, 0400, sta->debugfs_dir, &sta->field);
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index c9a8a2433e8a..48322e45e7dd 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -125,8 +125,11 @@ int drv_sta_state(struct ieee80211_local *local,
} else if (old_state == IEEE80211_STA_AUTH &&
new_state == IEEE80211_STA_ASSOC) {
ret = drv_sta_add(local, sdata, &sta->sta);
- if (ret == 0)
+ if (ret == 0) {
sta->uploaded = true;
+ if (rcu_access_pointer(sta->sta.rates))
+ drv_sta_rate_tbl_update(local, sdata, &sta->sta);
+ }
} else if (old_state == IEEE80211_STA_ASSOC &&
new_state == IEEE80211_STA_AUTH) {
drv_sta_remove(local, sdata, &sta->sta);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 41d495d73d3a..bcdfd19a596b 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1384,4 +1384,33 @@ static inline int drv_reset_tid_config(struct ieee80211_local *local,
return ret;
}
+
+static inline void drv_update_vif_offload(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+ check_sdata_in_driver(sdata);
+
+ if (!local->ops->update_vif_offload)
+ return;
+
+ trace_drv_update_vif_offload(local, sdata);
+ local->ops->update_vif_offload(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline void drv_sta_set_4addr(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, bool enabled)
+{
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_sta_set_4addr(local, sdata, sta, enabled);
+ if (local->ops->sta_set_4addr)
+ local->ops->sta_set_4addr(&local->hw, &sdata->vif, sta, enabled);
+ trace_drv_return_void(local);
+}
+
#endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 53632c2f5217..1f552f374e97 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -145,9 +145,9 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata,
*pos++ = csa_settings->block_tx ? 1 : 0;
*pos++ = ieee80211_frequency_to_channel(
csa_settings->chandef.chan->center_freq);
- presp->csa_counter_offsets[0] = (pos - presp->head);
+ presp->cntdwn_counter_offsets[0] = (pos - presp->head);
*pos++ = csa_settings->count;
- presp->csa_current_counter = csa_settings->count;
+ presp->cntdwn_current_counter = csa_settings->count;
}
/* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */
@@ -1037,7 +1037,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
}
if (sta && !sta->sta.wme &&
- elems->wmm_info && local->hw.queues >= IEEE80211_NUM_ACS) {
+ (elems->wmm_info || elems->s1g_capab) &&
+ local->hw.queues >= IEEE80211_NUM_ACS) {
sta->sta.wme = true;
ieee80211_check_fast_xmit(sta);
}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 0b1eaec6649f..8e281c2e644d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -259,15 +259,27 @@ struct beacon_data {
u8 *head, *tail;
int head_len, tail_len;
struct ieee80211_meshconf_ie *meshconf;
- u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM];
- u8 csa_current_counter;
+ u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
+ u8 cntdwn_current_counter;
struct rcu_head rcu_head;
};
struct probe_resp {
struct rcu_head rcu_head;
int len;
- u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM];
+ u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
+ u8 data[];
+};
+
+struct fils_discovery_data {
+ struct rcu_head rcu_head;
+ int len;
+ u8 data[];
+};
+
+struct unsol_bcast_probe_resp_data {
+ struct rcu_head rcu_head;
+ int len;
u8 data[];
};
@@ -286,6 +298,8 @@ struct ps_data {
struct ieee80211_if_ap {
struct beacon_data __rcu *beacon;
struct probe_resp __rcu *probe_resp;
+ struct fils_discovery_data __rcu *fils_discovery;
+ struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp;
/* to be used after channel switch. */
struct cfg80211_beacon_data *next_beacon;
@@ -297,11 +311,6 @@ struct ieee80211_if_ap {
bool multicast_to_unicast;
};
-struct ieee80211_if_wds {
- struct sta_info *sta;
- u8 remote_addr[ETH_ALEN];
-};
-
struct ieee80211_if_vlan {
struct list_head list; /* write-protected with RTNL and local->mtx */
@@ -443,7 +452,9 @@ struct ieee80211_if_managed {
unsigned long probe_timeout;
int probe_send_count;
bool nullfunc_failed;
- bool connection_loss;
+ u8 connection_loss:1,
+ driver_disconnect:1,
+ reconnect:1;
struct cfg80211_bss *associated;
struct ieee80211_mgd_auth_data *auth_data;
@@ -530,6 +541,8 @@ struct ieee80211_if_managed {
struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */
struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */
struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */
+ struct ieee80211_s1g_cap s1g_capa; /* configured S1G overrides */
+ struct ieee80211_s1g_cap s1g_capa_mask; /* valid s1g_capa bits */
/* TDLS support */
u8 tdls_peer[ETH_ALEN] __aligned(2);
@@ -969,7 +982,6 @@ struct ieee80211_sub_if_data {
union {
struct ieee80211_if_ap ap;
- struct ieee80211_if_wds wds;
struct ieee80211_if_vlan vlan;
struct ieee80211_if_managed mgd;
struct ieee80211_if_ibss ibss;
@@ -989,8 +1001,6 @@ struct ieee80211_sub_if_data {
} debugfs;
#endif
- bool hw_80211_encap;
-
/* must be last, dynamically sized area in this! */
struct ieee80211_vif vif;
};
@@ -1068,6 +1078,7 @@ enum queue_stop_reason {
IEEE80211_QUEUE_STOP_REASON_FLUSH,
IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN,
IEEE80211_QUEUE_STOP_REASON_RESERVE_TID,
+ IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE,
IEEE80211_QUEUE_STOP_REASONS,
};
@@ -1364,7 +1375,6 @@ struct ieee80211_local {
*/
bool pspolling;
- bool offchannel_ps_enabled;
/*
* PS can only be enabled when we have exactly one managed
* interface (and monitors) in PS, this then points there.
@@ -1522,6 +1532,10 @@ struct ieee802_11_elems {
u8 dtim_count;
u8 dtim_period;
const struct ieee80211_addba_ext_ie *addba_ext_ie;
+ const struct ieee80211_s1g_cap *s1g_capab;
+ const struct ieee80211_s1g_oper_ie *s1g_oper;
+ const struct ieee80211_s1g_bcn_compat_ie *s1g_bcn_compat;
+ const struct ieee80211_aid_response_ie *aid_resp;
/* length of them, respectively */
u8 ext_capab_len;
@@ -1576,13 +1590,8 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
{
WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START &&
status->flag & RX_FLAG_MACTIME_END);
- if (status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END))
- return true;
- /* can't handle non-legacy preamble yet */
- if (status->flag & RX_FLAG_MACTIME_PLCP_START &&
- status->encoding == RX_ENC_LEGACY)
- return true;
- return false;
+ return !!(status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END |
+ RX_FLAG_MACTIME_PLCP_START));
}
void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata);
@@ -1640,6 +1649,8 @@ int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb);
+void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata);
void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata);
@@ -1767,6 +1778,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local);
bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata);
void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
bool update_bss);
+void ieee80211_recalc_offload(struct ieee80211_local *local);
static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
{
@@ -1775,7 +1787,7 @@ static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
/* tx handling */
void ieee80211_clear_tx_pending(struct ieee80211_local *local);
-void ieee80211_tx_pending(unsigned long data);
+void ieee80211_tx_pending(struct tasklet_struct *t);
netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
struct net_device *dev);
netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
@@ -1908,6 +1920,9 @@ void
ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif,
const struct ieee80211_he_operation *he_op_ie_elem);
+/* S1G */
+void ieee80211_s1g_sta_rate_init(struct sta_info *sta);
+
/* Spectrum management */
void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt,
@@ -2040,8 +2055,6 @@ void ieee80211_dynamic_ps_timer(struct timer_list *t);
void ieee80211_send_nullfunc(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
bool powersave);
-void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_hdr *hdr);
void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
struct ieee80211_hdr *hdr, bool ack, u16 tx_time);
@@ -2125,7 +2138,7 @@ void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata);
void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
struct txq_info *txqi);
-void ieee80211_wake_txqs(unsigned long data);
+void ieee80211_wake_txqs(struct tasklet_struct *t);
void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
u16 transaction, u16 auth_alg, u16 status,
const u8 *extra, size_t extra_len, const u8 *bssid,
@@ -2193,6 +2206,11 @@ int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, bool need_basic,
enum nl80211_band band);
u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
+void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta_s1g_cap *caps,
+ struct sk_buff *skb);
+void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
/* channel management */
bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
@@ -2204,6 +2222,8 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_he_operation *he_oper,
struct cfg80211_chan_def *chandef);
+bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
+ struct cfg80211_chan_def *chandef);
u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c);
int __must_check
@@ -2258,7 +2278,6 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
enum ieee80211_chanctx_mode chanmode,
u8 radar_detect);
int ieee80211_max_num_channels(struct ieee80211_local *local);
-enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta);
void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx);
@@ -2282,6 +2301,9 @@ void ieee80211_tdls_chsw_work(struct work_struct *wk);
void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata,
const u8 *peer, u16 reason);
const char *ieee80211_get_reason_code_string(u16 reason_code);
+u16 ieee80211_encode_usf(int val);
+u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
+ enum nl80211_iftype type);
extern const struct ethtool_ops ieee80211_ethtool_ops;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 9740ae8fa697..b31417f40bd5 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -230,10 +230,6 @@ static inline int identical_mac_addr_allowed(int type1, int type2)
type2 == NL80211_IFTYPE_MONITOR ||
type1 == NL80211_IFTYPE_P2P_DEVICE ||
type2 == NL80211_IFTYPE_P2P_DEVICE ||
- (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_WDS) ||
- (type1 == NL80211_IFTYPE_WDS &&
- (type2 == NL80211_IFTYPE_WDS ||
- type2 == NL80211_IFTYPE_AP)) ||
(type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_AP_VLAN) ||
(type1 == NL80211_IFTYPE_AP_VLAN &&
(type2 == NL80211_IFTYPE_AP ||
@@ -348,439 +344,6 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata,
return 0;
}
-void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
- const int offset)
-{
- struct ieee80211_local *local = sdata->local;
- u32 flags = sdata->u.mntr.flags;
-
-#define ADJUST(_f, _s) do { \
- if (flags & MONITOR_FLAG_##_f) \
- local->fif_##_s += offset; \
- } while (0)
-
- ADJUST(FCSFAIL, fcsfail);
- ADJUST(PLCPFAIL, plcpfail);
- ADJUST(CONTROL, control);
- ADJUST(CONTROL, pspoll);
- ADJUST(OTHER_BSS, other_bss);
-
-#undef ADJUST
-}
-
-static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata)
-{
- struct ieee80211_local *local = sdata->local;
- int i;
-
- for (i = 0; i < IEEE80211_NUM_ACS; i++) {
- if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
- sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE;
- else if (local->hw.queues >= IEEE80211_NUM_ACS)
- sdata->vif.hw_queue[i] = i;
- else
- sdata->vif.hw_queue[i] = 0;
- }
- sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
-}
-
-int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
-{
- struct ieee80211_sub_if_data *sdata;
- int ret;
-
- if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
- return 0;
-
- ASSERT_RTNL();
-
- if (local->monitor_sdata)
- return 0;
-
- sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL);
- if (!sdata)
- return -ENOMEM;
-
- /* set up data */
- sdata->local = local;
- sdata->vif.type = NL80211_IFTYPE_MONITOR;
- snprintf(sdata->name, IFNAMSIZ, "%s-monitor",
- wiphy_name(local->hw.wiphy));
- sdata->wdev.iftype = NL80211_IFTYPE_MONITOR;
-
- sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
-
- ieee80211_set_default_queues(sdata);
-
- ret = drv_add_interface(local, sdata);
- if (WARN_ON(ret)) {
- /* ok .. stupid driver, it asked for this! */
- kfree(sdata);
- return ret;
- }
-
- ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR);
- if (ret) {
- kfree(sdata);
- return ret;
- }
-
- mutex_lock(&local->iflist_mtx);
- rcu_assign_pointer(local->monitor_sdata, sdata);
- mutex_unlock(&local->iflist_mtx);
-
- mutex_lock(&local->mtx);
- ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef,
- IEEE80211_CHANCTX_EXCLUSIVE);
- mutex_unlock(&local->mtx);
- if (ret) {
- mutex_lock(&local->iflist_mtx);
- RCU_INIT_POINTER(local->monitor_sdata, NULL);
- mutex_unlock(&local->iflist_mtx);
- synchronize_net();
- drv_remove_interface(local, sdata);
- kfree(sdata);
- return ret;
- }
-
- skb_queue_head_init(&sdata->skb_queue);
- INIT_WORK(&sdata->work, ieee80211_iface_work);
-
- return 0;
-}
-
-void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
-{
- struct ieee80211_sub_if_data *sdata;
-
- if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
- return;
-
- ASSERT_RTNL();
-
- mutex_lock(&local->iflist_mtx);
-
- sdata = rcu_dereference_protected(local->monitor_sdata,
- lockdep_is_held(&local->iflist_mtx));
- if (!sdata) {
- mutex_unlock(&local->iflist_mtx);
- return;
- }
-
- RCU_INIT_POINTER(local->monitor_sdata, NULL);
- mutex_unlock(&local->iflist_mtx);
-
- synchronize_net();
-
- mutex_lock(&local->mtx);
- ieee80211_vif_release_channel(sdata);
- mutex_unlock(&local->mtx);
-
- drv_remove_interface(local, sdata);
-
- kfree(sdata);
-}
-
-/*
- * NOTE: Be very careful when changing this function, it must NOT return
- * an error on interface type changes that have been pre-checked, so most
- * checks should be in ieee80211_check_concurrent_iface.
- */
-int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
-{
- struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- struct net_device *dev = wdev->netdev;
- struct ieee80211_local *local = sdata->local;
- struct sta_info *sta;
- u32 changed = 0;
- int res;
- u32 hw_reconf_flags = 0;
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_WDS:
- if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
- return -ENOLINK;
- break;
- case NL80211_IFTYPE_AP_VLAN: {
- struct ieee80211_sub_if_data *master;
-
- if (!sdata->bss)
- return -ENOLINK;
-
- mutex_lock(&local->mtx);
- list_add(&sdata->u.vlan.list, &sdata->bss->vlans);
- mutex_unlock(&local->mtx);
-
- master = container_of(sdata->bss,
- struct ieee80211_sub_if_data, u.ap);
- sdata->control_port_protocol =
- master->control_port_protocol;
- sdata->control_port_no_encrypt =
- master->control_port_no_encrypt;
- sdata->control_port_over_nl80211 =
- master->control_port_over_nl80211;
- sdata->control_port_no_preauth =
- master->control_port_no_preauth;
- sdata->vif.cab_queue = master->vif.cab_queue;
- memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
- sizeof(sdata->vif.hw_queue));
- sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef;
-
- mutex_lock(&local->key_mtx);
- sdata->crypto_tx_tailroom_needed_cnt +=
- master->crypto_tx_tailroom_needed_cnt;
- mutex_unlock(&local->key_mtx);
-
- break;
- }
- case NL80211_IFTYPE_AP:
- sdata->bss = &sdata->u.ap;
- break;
- case NL80211_IFTYPE_MESH_POINT:
- case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_MONITOR:
- case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_P2P_DEVICE:
- case NL80211_IFTYPE_OCB:
- case NL80211_IFTYPE_NAN:
- /* no special treatment */
- break;
- case NL80211_IFTYPE_UNSPECIFIED:
- case NUM_NL80211_IFTYPES:
- case NL80211_IFTYPE_P2P_CLIENT:
- case NL80211_IFTYPE_P2P_GO:
- /* cannot happen */
- WARN_ON(1);
- break;
- }
-
- if (local->open_count == 0) {
- res = drv_start(local);
- if (res)
- goto err_del_bss;
- /* we're brought up, everything changes */
- hw_reconf_flags = ~0;
- ieee80211_led_radio(local, true);
- ieee80211_mod_tpt_led_trig(local,
- IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
- }
-
- /*
- * Copy the hopefully now-present MAC address to
- * this interface, if it has the special null one.
- */
- if (dev && is_zero_ether_addr(dev->dev_addr)) {
- memcpy(dev->dev_addr,
- local->hw.wiphy->perm_addr,
- ETH_ALEN);
- memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);
-
- if (!is_valid_ether_addr(dev->dev_addr)) {
- res = -EADDRNOTAVAIL;
- goto err_stop;
- }
- }
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_AP_VLAN:
- /* no need to tell driver, but set carrier and chanctx */
- if (rtnl_dereference(sdata->bss->beacon)) {
- ieee80211_vif_vlan_copy_chanctx(sdata);
- netif_carrier_on(dev);
- } else {
- netif_carrier_off(dev);
- }
- break;
- case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
- local->cooked_mntrs++;
- break;
- }
-
- if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
- res = drv_add_interface(local, sdata);
- if (res)
- goto err_stop;
- } else if (local->monitors == 0 && local->open_count == 0) {
- res = ieee80211_add_virtual_monitor(local);
- if (res)
- goto err_stop;
- }
-
- /* must be before the call to ieee80211_configure_filter */
- local->monitors++;
- if (local->monitors == 1) {
- local->hw.conf.flags |= IEEE80211_CONF_MONITOR;
- hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
- }
-
- ieee80211_adjust_monitor_flags(sdata, 1);
- ieee80211_configure_filter(local);
- mutex_lock(&local->mtx);
- ieee80211_recalc_idle(local);
- mutex_unlock(&local->mtx);
-
- netif_carrier_on(dev);
- break;
- default:
- if (coming_up) {
- ieee80211_del_virtual_monitor(local);
-
- res = drv_add_interface(local, sdata);
- if (res)
- goto err_stop;
- res = ieee80211_check_queues(sdata,
- ieee80211_vif_type_p2p(&sdata->vif));
- if (res)
- goto err_del_interface;
- }
-
- if (sdata->vif.type == NL80211_IFTYPE_AP) {
- local->fif_pspoll++;
- local->fif_probe_req++;
-
- ieee80211_configure_filter(local);
- } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- local->fif_probe_req++;
- }
-
- if (sdata->vif.probe_req_reg)
- drv_config_iface_filter(local, sdata,
- FIF_PROBE_REQ,
- FIF_PROBE_REQ);
-
- if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
- sdata->vif.type != NL80211_IFTYPE_NAN)
- changed |= ieee80211_reset_erp_info(sdata);
- ieee80211_bss_info_change_notify(sdata, changed);
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_MESH_POINT:
- case NL80211_IFTYPE_OCB:
- netif_carrier_off(dev);
- break;
- case NL80211_IFTYPE_WDS:
- case NL80211_IFTYPE_P2P_DEVICE:
- case NL80211_IFTYPE_NAN:
- break;
- default:
- /* not reached */
- WARN_ON(1);
- }
-
- /*
- * Set default queue parameters so drivers don't
- * need to initialise the hardware if the hardware
- * doesn't start up with sane defaults.
- * Enable QoS for anything but station interfaces.
- */
- ieee80211_set_wmm_default(sdata, true,
- sdata->vif.type != NL80211_IFTYPE_STATION);
- }
-
- set_bit(SDATA_STATE_RUNNING, &sdata->state);
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_WDS:
- /* Create STA entry for the WDS peer */
- sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
- GFP_KERNEL);
- if (!sta) {
- res = -ENOMEM;
- goto err_del_interface;
- }
-
- sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
- sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
- sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED);
-
- res = sta_info_insert(sta);
- if (res) {
- /* STA has been freed */
- goto err_del_interface;
- }
-
- rate_control_rate_init(sta);
- netif_carrier_on(dev);
- break;
- case NL80211_IFTYPE_P2P_DEVICE:
- rcu_assign_pointer(local->p2p_sdata, sdata);
- break;
- case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
- break;
- list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list);
- break;
- default:
- break;
- }
-
- /*
- * set_multicast_list will be invoked by the networking core
- * which will check whether any increments here were done in
- * error and sync them down to the hardware as filter flags.
- */
- if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
- atomic_inc(&local->iff_allmultis);
-
- if (coming_up)
- local->open_count++;
-
- if (hw_reconf_flags)
- ieee80211_hw_config(local, hw_reconf_flags);
-
- ieee80211_recalc_ps(local);
-
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- local->ops->wake_tx_queue) {
- /* XXX: for AP_VLAN, actually track AP queues */
- if (dev)
- netif_tx_start_all_queues(dev);
- } else if (dev) {
- unsigned long flags;
- int n_acs = IEEE80211_NUM_ACS;
- int ac;
-
- if (local->hw.queues < IEEE80211_NUM_ACS)
- n_acs = 1;
-
- spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
- if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE ||
- (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 &&
- skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) {
- for (ac = 0; ac < n_acs; ac++) {
- int ac_queue = sdata->vif.hw_queue[ac];
-
- if (local->queue_stop_reasons[ac_queue] == 0 &&
- skb_queue_empty(&local->pending[ac_queue]))
- netif_start_subqueue(dev, ac);
- }
- }
- spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
- }
-
- return 0;
- err_del_interface:
- drv_remove_interface(local, sdata);
- err_stop:
- if (!local->open_count)
- drv_stop(local);
- err_del_bss:
- sdata->bss = NULL;
- if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
- mutex_lock(&local->mtx);
- list_del(&sdata->u.vlan.list);
- mutex_unlock(&local->mtx);
- }
- /* might already be clear but that doesn't matter */
- clear_bit(SDATA_STATE_RUNNING, &sdata->state);
- return res;
-}
-
static int ieee80211_open(struct net_device *dev)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -850,15 +413,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
* (because if we remove a STA after ops->remove_interface()
* the driver will have removed the vif info already!)
*
- * In WDS mode a station must exist here and be flushed, for
- * AP_VLANs stations may exist since there's nothing else that
+ * For AP_VLANs stations may exist since there's nothing else that
* would have removed them, but in other modes there shouldn't
* be any stations.
*/
flushed = sta_info_flush(sdata);
- WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
- ((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) ||
- (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)));
+ WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && flushed > 0);
/* don't count this interface for allmulti while it is down */
if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
@@ -985,8 +545,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
* When we get here, the interface is marked down.
* Free the remaining keys, if there are any
* (which can happen in AP mode if userspace sets
- * keys before the interface is operating, and maybe
- * also in WDS mode)
+ * keys before the interface is operating)
*
* Force the key freeing to always synchronize_net()
* to wait for the RX path in case it is using this
@@ -1142,28 +701,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev,
static void
ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_sw_netstats *tstats;
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- tstats = per_cpu_ptr(dev->tstats, i);
-
- do {
- start = u64_stats_fetch_begin_irq(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
-
- stats->rx_packets += rx_packets;
- stats->tx_packets += tx_packets;
- stats->rx_bytes += rx_bytes;
- stats->tx_bytes += tx_bytes;
- }
+ dev_fetch_sw_netstats(stats, dev->tstats);
}
static const struct net_device_ops ieee80211_dataif_ops = {
@@ -1227,60 +765,522 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = {
.ndo_get_stats64 = ieee80211_get_stats64,
};
-static void __ieee80211_set_hw_80211_encap(struct ieee80211_sub_if_data *sdata,
- bool enable)
+static bool ieee80211_iftype_supports_encap_offload(enum nl80211_iftype iftype)
{
- sdata->dev->netdev_ops = enable ? &ieee80211_dataif_8023_ops :
- &ieee80211_dataif_ops;
- sdata->hw_80211_encap = enable;
+ switch (iftype) {
+ /* P2P GO and client are mapped to AP/STATION types */
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_STATION:
+ return true;
+ default:
+ return false;
+ }
}
-bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable)
+static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdata)
{
- struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct ieee80211_local *local = sdata->local;
- struct ieee80211_sub_if_data *iter;
- struct ieee80211_key *key;
+ u32 flags;
+
+ flags = sdata->vif.offload_flags;
+
+ if (ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) &&
+ ieee80211_iftype_supports_encap_offload(sdata->vif.type)) {
+ flags |= IEEE80211_OFFLOAD_ENCAP_ENABLED;
+
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) &&
+ local->hw.wiphy->frag_threshold != (u32)-1)
+ flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED;
+
+ if (local->monitors)
+ flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED;
+ } else {
+ flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED;
+ }
+
+ if (sdata->vif.offload_flags == flags)
+ return false;
+
+ sdata->vif.offload_flags = flags;
+ return true;
+}
+
+static void ieee80211_set_vif_encap_ops(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *bss = sdata;
+ bool enabled;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ if (!sdata->bss)
+ return;
+
+ bss = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap);
+ }
+
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) ||
+ !ieee80211_iftype_supports_encap_offload(bss->vif.type))
+ return;
+
+ enabled = bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED;
+ if (sdata->wdev.use_4addr &&
+ !(bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_4ADDR))
+ enabled = false;
+
+ sdata->dev->netdev_ops = enabled ? &ieee80211_dataif_8023_ops :
+ &ieee80211_dataif_ops;
+}
+
+static void ieee80211_recalc_sdata_offload(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *vsdata;
+
+ if (ieee80211_set_sdata_offload_flags(sdata)) {
+ drv_update_vif_offload(local, sdata);
+ ieee80211_set_vif_encap_ops(sdata);
+ }
+
+ list_for_each_entry(vsdata, &local->interfaces, list) {
+ if (vsdata->vif.type != NL80211_IFTYPE_AP_VLAN ||
+ vsdata->bss != &sdata->u.ap)
+ continue;
+
+ ieee80211_set_vif_encap_ops(vsdata);
+ }
+}
+
+void ieee80211_recalc_offload(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD))
+ return;
mutex_lock(&local->iflist_mtx);
- list_for_each_entry(iter, &local->interfaces, list) {
- struct ieee80211_sub_if_data *disable = NULL;
-
- if (vif->type == NL80211_IFTYPE_MONITOR) {
- disable = iter;
- __ieee80211_set_hw_80211_encap(iter, false);
- } else if (iter->vif.type == NL80211_IFTYPE_MONITOR) {
- disable = sdata;
- enable = false;
- }
- if (disable)
- sdata_dbg(disable,
- "disable hw 80211 encap due to mon co-exist\n");
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ ieee80211_recalc_sdata_offload(sdata);
}
+
mutex_unlock(&local->iflist_mtx);
+}
- if (enable == sdata->hw_80211_encap)
- return enable;
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+ const int offset)
+{
+ struct ieee80211_local *local = sdata->local;
+ u32 flags = sdata->u.mntr.flags;
- if (!sdata->dev)
- return false;
+#define ADJUST(_f, _s) do { \
+ if (flags & MONITOR_FLAG_##_f) \
+ local->fif_##_s += offset; \
+ } while (0)
+
+ ADJUST(FCSFAIL, fcsfail);
+ ADJUST(PLCPFAIL, plcpfail);
+ ADJUST(CONTROL, control);
+ ADJUST(CONTROL, pspoll);
+ ADJUST(OTHER_BSS, other_bss);
+
+#undef ADJUST
+}
+
+static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ int i;
+
+ for (i = 0; i < IEEE80211_NUM_ACS; i++) {
+ if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
+ sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE;
+ else if (local->hw.queues >= IEEE80211_NUM_ACS)
+ sdata->vif.hw_queue[i] = i;
+ else
+ sdata->vif.hw_queue[i] = 0;
+ }
+ sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
+}
+
+int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+ int ret;
+
+ if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
+ return 0;
+
+ ASSERT_RTNL();
+
+ if (local->monitor_sdata)
+ return 0;
+
+ sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL);
+ if (!sdata)
+ return -ENOMEM;
+
+ /* set up data */
+ sdata->local = local;
+ sdata->vif.type = NL80211_IFTYPE_MONITOR;
+ snprintf(sdata->name, IFNAMSIZ, "%s-monitor",
+ wiphy_name(local->hw.wiphy));
+ sdata->wdev.iftype = NL80211_IFTYPE_MONITOR;
+
+ sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
+
+ ieee80211_set_default_queues(sdata);
+
+ ret = drv_add_interface(local, sdata);
+ if (WARN_ON(ret)) {
+ /* ok .. stupid driver, it asked for this! */
+ kfree(sdata);
+ return ret;
+ }
+
+ set_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+ ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR);
+ if (ret) {
+ kfree(sdata);
+ return ret;
+ }
+
+ mutex_lock(&local->iflist_mtx);
+ rcu_assign_pointer(local->monitor_sdata, sdata);
+ mutex_unlock(&local->iflist_mtx);
+
+ mutex_lock(&local->mtx);
+ ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef,
+ IEEE80211_CHANCTX_EXCLUSIVE);
+ mutex_unlock(&local->mtx);
+ if (ret) {
+ mutex_lock(&local->iflist_mtx);
+ RCU_INIT_POINTER(local->monitor_sdata, NULL);
+ mutex_unlock(&local->iflist_mtx);
+ synchronize_net();
+ drv_remove_interface(local, sdata);
+ kfree(sdata);
+ return ret;
+ }
+
+ skb_queue_head_init(&sdata->skb_queue);
+ INIT_WORK(&sdata->work, ieee80211_iface_work);
+
+ return 0;
+}
+
+void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
+ return;
+
+ ASSERT_RTNL();
+
+ mutex_lock(&local->iflist_mtx);
+
+ sdata = rcu_dereference_protected(local->monitor_sdata,
+ lockdep_is_held(&local->iflist_mtx));
+ if (!sdata) {
+ mutex_unlock(&local->iflist_mtx);
+ return;
+ }
+
+ RCU_INIT_POINTER(local->monitor_sdata, NULL);
+ mutex_unlock(&local->iflist_mtx);
+
+ synchronize_net();
+
+ mutex_lock(&local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&local->mtx);
+
+ drv_remove_interface(local, sdata);
+
+ kfree(sdata);
+}
+
+/*
+ * NOTE: Be very careful when changing this function, it must NOT return
+ * an error on interface type changes that have been pre-checked, so most
+ * checks should be in ieee80211_check_concurrent_iface.
+ */
+int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct net_device *dev = wdev->netdev;
+ struct ieee80211_local *local = sdata->local;
+ u32 changed = 0;
+ int res;
+ u32 hw_reconf_flags = 0;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN: {
+ struct ieee80211_sub_if_data *master;
+
+ if (!sdata->bss)
+ return -ENOLINK;
+
+ mutex_lock(&local->mtx);
+ list_add(&sdata->u.vlan.list, &sdata->bss->vlans);
+ mutex_unlock(&local->mtx);
+
+ master = container_of(sdata->bss,
+ struct ieee80211_sub_if_data, u.ap);
+ sdata->control_port_protocol =
+ master->control_port_protocol;
+ sdata->control_port_no_encrypt =
+ master->control_port_no_encrypt;
+ sdata->control_port_over_nl80211 =
+ master->control_port_over_nl80211;
+ sdata->control_port_no_preauth =
+ master->control_port_no_preauth;
+ sdata->vif.cab_queue = master->vif.cab_queue;
+ memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
+ sizeof(sdata->vif.hw_queue));
+ sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef;
+
+ mutex_lock(&local->key_mtx);
+ sdata->crypto_tx_tailroom_needed_cnt +=
+ master->crypto_tx_tailroom_needed_cnt;
+ mutex_unlock(&local->key_mtx);
+
+ break;
+ }
+ case NL80211_IFTYPE_AP:
+ sdata->bss = &sdata->u.ap;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_OCB:
+ case NL80211_IFTYPE_NAN:
+ /* no special treatment */
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_WDS:
+ /* cannot happen */
+ WARN_ON(1);
+ break;
+ }
+
+ if (local->open_count == 0) {
+ res = drv_start(local);
+ if (res)
+ goto err_del_bss;
+ /* we're brought up, everything changes */
+ hw_reconf_flags = ~0;
+ ieee80211_led_radio(local, true);
+ ieee80211_mod_tpt_led_trig(local,
+ IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
+ }
- if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) &&
- (local->hw.wiphy->frag_threshold != (u32)-1))
- enable = false;
+ /*
+ * Copy the hopefully now-present MAC address to
+ * this interface, if it has the special null one.
+ */
+ if (dev && is_zero_ether_addr(dev->dev_addr)) {
+ memcpy(dev->dev_addr,
+ local->hw.wiphy->perm_addr,
+ ETH_ALEN);
+ memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);
- mutex_lock(&sdata->local->key_mtx);
- list_for_each_entry(key, &sdata->key_list, list) {
- if (key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)
- enable = false;
+ if (!is_valid_ether_addr(dev->dev_addr)) {
+ res = -EADDRNOTAVAIL;
+ goto err_stop;
+ }
}
- mutex_unlock(&sdata->local->key_mtx);
- __ieee80211_set_hw_80211_encap(sdata, enable);
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ /* no need to tell driver, but set carrier and chanctx */
+ if (rtnl_dereference(sdata->bss->beacon)) {
+ ieee80211_vif_vlan_copy_chanctx(sdata);
+ netif_carrier_on(dev);
+ ieee80211_set_vif_encap_ops(sdata);
+ } else {
+ netif_carrier_off(dev);
+ }
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
+ local->cooked_mntrs++;
+ break;
+ }
- return enable;
+ if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
+ res = drv_add_interface(local, sdata);
+ if (res)
+ goto err_stop;
+ } else if (local->monitors == 0 && local->open_count == 0) {
+ res = ieee80211_add_virtual_monitor(local);
+ if (res)
+ goto err_stop;
+ }
+
+ /* must be before the call to ieee80211_configure_filter */
+ local->monitors++;
+ if (local->monitors == 1) {
+ local->hw.conf.flags |= IEEE80211_CONF_MONITOR;
+ hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
+ }
+
+ ieee80211_adjust_monitor_flags(sdata, 1);
+ ieee80211_configure_filter(local);
+ ieee80211_recalc_offload(local);
+ mutex_lock(&local->mtx);
+ ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
+
+ netif_carrier_on(dev);
+ break;
+ default:
+ if (coming_up) {
+ ieee80211_del_virtual_monitor(local);
+ ieee80211_set_sdata_offload_flags(sdata);
+
+ res = drv_add_interface(local, sdata);
+ if (res)
+ goto err_stop;
+
+ ieee80211_set_vif_encap_ops(sdata);
+ res = ieee80211_check_queues(sdata,
+ ieee80211_vif_type_p2p(&sdata->vif));
+ if (res)
+ goto err_del_interface;
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ local->fif_pspoll++;
+ local->fif_probe_req++;
+
+ ieee80211_configure_filter(local);
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ local->fif_probe_req++;
+ }
+
+ if (sdata->vif.probe_req_reg)
+ drv_config_iface_filter(local, sdata,
+ FIF_PROBE_REQ,
+ FIF_PROBE_REQ);
+
+ if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
+ sdata->vif.type != NL80211_IFTYPE_NAN)
+ changed |= ieee80211_reset_erp_info(sdata);
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_OCB:
+ netif_carrier_off(dev);
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
+ break;
+ default:
+ /* not reached */
+ WARN_ON(1);
+ }
+
+ /*
+ * Set default queue parameters so drivers don't
+ * need to initialise the hardware if the hardware
+ * doesn't start up with sane defaults.
+ * Enable QoS for anything but station interfaces.
+ */
+ ieee80211_set_wmm_default(sdata, true,
+ sdata->vif.type != NL80211_IFTYPE_STATION);
+ }
+
+ set_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_P2P_DEVICE:
+ rcu_assign_pointer(local->p2p_sdata, sdata);
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
+ break;
+ list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list);
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * set_multicast_list will be invoked by the networking core
+ * which will check whether any increments here were done in
+ * error and sync them down to the hardware as filter flags.
+ */
+ if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
+ atomic_inc(&local->iff_allmultis);
+
+ if (coming_up)
+ local->open_count++;
+
+ if (hw_reconf_flags)
+ ieee80211_hw_config(local, hw_reconf_flags);
+
+ ieee80211_recalc_ps(local);
+
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ local->ops->wake_tx_queue) {
+ /* XXX: for AP_VLAN, actually track AP queues */
+ if (dev)
+ netif_tx_start_all_queues(dev);
+ } else if (dev) {
+ unsigned long flags;
+ int n_acs = IEEE80211_NUM_ACS;
+ int ac;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ n_acs = 1;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE ||
+ (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 &&
+ skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) {
+ for (ac = 0; ac < n_acs; ac++) {
+ int ac_queue = sdata->vif.hw_queue[ac];
+
+ if (local->queue_stop_reasons[ac_queue] == 0 &&
+ skb_queue_empty(&local->pending[ac_queue]))
+ netif_start_subqueue(dev, ac);
+ }
+ }
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ }
+
+ return 0;
+ err_del_interface:
+ drv_remove_interface(local, sdata);
+ err_stop:
+ if (!local->open_count)
+ drv_stop(local);
+ err_del_bss:
+ sdata->bss = NULL;
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ mutex_lock(&local->mtx);
+ list_del(&sdata->u.vlan.list);
+ mutex_unlock(&local->mtx);
+ }
+ /* might already be clear but that doesn't matter */
+ clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+ return res;
}
-EXPORT_SYMBOL(ieee80211_set_hw_80211_encap);
static void ieee80211_if_free(struct net_device *dev)
{
@@ -1323,6 +1323,7 @@ static void ieee80211_iface_work(struct work_struct *work)
while ((skb = skb_dequeue(&sdata->skb_queue))) {
struct ieee80211_mgmt *mgmt = (void *)skb->data;
+ kcov_remote_start_common(skb_get_kcov_handle(skb));
if (ieee80211_is_action(mgmt->frame_control) &&
mgmt->u.action.category == WLAN_CATEGORY_BACK) {
int len = skb->len;
@@ -1379,6 +1380,11 @@ static void ieee80211_iface_work(struct work_struct *work)
WARN_ON(1);
break;
}
+ } else if (ieee80211_is_ext(mgmt->frame_control)) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ ieee80211_sta_rx_queued_ext(sdata, skb);
+ else
+ WARN_ON(1);
} else if (ieee80211_is_data_qos(mgmt->frame_control)) {
struct ieee80211_hdr *hdr = (void *)mgmt;
/*
@@ -1427,6 +1433,7 @@ static void ieee80211_iface_work(struct work_struct *work)
}
kfree_skb(skb);
+ kcov_remote_stop();
}
/* then other type-dependent work */
@@ -1484,7 +1491,6 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.txpower = INT_MIN; /* unset */
sdata->noack_map = 0;
- sdata->hw_80211_encap = false;
/* only monitor/p2p-device differ */
if (sdata->dev) {
@@ -1537,9 +1543,6 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
sdata->u.mntr.flags = MONITOR_FLAG_CONTROL |
MONITOR_FLAG_OTHER_BSS;
break;
- case NL80211_IFTYPE_WDS:
- sdata->vif.bss_conf.bssid = NULL;
- break;
case NL80211_IFTYPE_NAN:
idr_init(&sdata->u.nan.function_inst_ids);
spin_lock_init(&sdata->u.nan.func_lock);
@@ -1550,6 +1553,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.bssid = sdata->vif.addr;
break;
case NL80211_IFTYPE_UNSPECIFIED:
+ case NL80211_IFTYPE_WDS:
case NUM_NL80211_IFTYPES:
WARN_ON(1);
break;
@@ -1594,9 +1598,7 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_OCB:
/*
* Could probably support everything
- * but WDS here (WDS do_open can fail
- * under memory pressure, which this
- * code isn't prepared to handle).
+ * but here.
*/
break;
case NL80211_IFTYPE_P2P_CLIENT:
@@ -1615,10 +1617,15 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
if (ret)
return ret;
+ ieee80211_stop_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE);
+ synchronize_net();
+
ieee80211_do_stop(sdata, false);
ieee80211_teardown_sdata(sdata);
+ ieee80211_set_sdata_offload_flags(sdata);
ret = drv_change_interface(local, sdata, internal_type, p2p);
if (ret)
type = ieee80211_vif_type_p2p(&sdata->vif);
@@ -1631,10 +1638,13 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
ieee80211_check_queues(sdata, type);
ieee80211_setup_sdata(sdata, type);
+ ieee80211_set_vif_encap_ops(sdata);
err = ieee80211_do_open(&sdata->wdev, false);
WARN(err, "type change: do_open returned %d", err);
+ ieee80211_wake_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE);
return ret;
}
@@ -1687,7 +1697,6 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local,
case NL80211_IFTYPE_MONITOR:
/* doesn't matter */
break;
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_AP_VLAN:
/* match up with an AP interface */
list_for_each_entry(sdata, &local->interfaces, list) {
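The kcov_remote_start_common()/kcov_remote_stop() pair added to the interface work loop above lets KCOV attribute coverage gathered in the deferred worker back to the task that queued the frame. A minimal sketch of that pattern (not mac80211 code; my_process_skb() and my_iface_work() are hypothetical placeholders, only the kcov/skb helpers are real kernel API) might look like:

#include <linux/kcov.h>
#include <linux/skbuff.h>

/* hypothetical per-frame handler, stands in for the real dispatch */
static void my_process_skb(struct sk_buff *skb)
{
}

static void my_iface_work(struct sk_buff_head *queue)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(queue))) {
		/* replay the handle recorded when the skb was queued */
		kcov_remote_start_common(skb_get_kcov_handle(skb));
		my_process_skb(skb);
		kcov_remote_stop();
		kfree_skb(skb);
	}
}
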
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 2df636c32432..a4817aa4b171 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -177,13 +177,6 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
}
}
- /* TKIP countermeasures don't work in encap offload mode */
- if (key->conf.cipher == WLAN_CIPHER_SUITE_TKIP &&
- sdata->hw_80211_encap) {
- sdata_dbg(sdata, "TKIP is not allowed in hw 80211 encap mode\n");
- return -EINVAL;
- }
-
ret = drv_set_key(key->local, SET_KEY, sdata,
sta ? &sta->sta : NULL, &key->conf);
@@ -219,14 +212,6 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
case WLAN_CIPHER_SUITE_CCMP_256:
case WLAN_CIPHER_SUITE_GCMP:
case WLAN_CIPHER_SUITE_GCMP_256:
- /* We cannot do software crypto of data frames with
- * encapsulation offload enabled. However for 802.11w to
- * function properly we need cmac/gmac keys.
- */
- if (sdata->hw_80211_encap)
- return -EINVAL;
- fallthrough;
-
case WLAN_CIPHER_SUITE_AES_CMAC:
case WLAN_CIPHER_SUITE_BIP_CMAC_256:
case WLAN_CIPHER_SUITE_BIP_GMAC_128:
@@ -1315,3 +1300,52 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
return &key->conf;
}
EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_add);
+
+void ieee80211_key_mic_failure(struct ieee80211_key_conf *keyconf)
+{
+ struct ieee80211_key *key;
+
+ key = container_of(keyconf, struct ieee80211_key, conf);
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ key->u.aes_cmac.icverrors++;
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ key->u.aes_gmac.icverrors++;
+ break;
+ default:
+ /* ignore the others for now, we don't keep counters for them yet */
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(ieee80211_key_mic_failure);
+
+void ieee80211_key_replay(struct ieee80211_key_conf *keyconf)
+{
+ struct ieee80211_key *key;
+
+ key = container_of(keyconf, struct ieee80211_key, conf);
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ key->u.ccmp.replays++;
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ key->u.aes_cmac.replays++;
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ key->u.aes_gmac.replays++;
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ key->u.gcmp.replays++;
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(ieee80211_key_replay);
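The two new exports above give drivers that offload RX crypto a way to bump mac80211's per-key ICV-error and replay counters when the hardware reports such events. A hedged sketch of a driver-side caller follows; my_crypto_event, my_lookup_keyconf() and my_handle_crypto_event() are hypothetical, only ieee80211_key_mic_failure() and ieee80211_key_replay() are the entry points added here:

#include <net/mac80211.h>

struct my_crypto_event {			/* hypothetical firmware event */
	u8 key_idx;
	bool is_replay;
};

/* hypothetical: look up the ieee80211_key_conf the driver programmed earlier */
static struct ieee80211_key_conf *my_lookup_keyconf(struct ieee80211_vif *vif,
						    u8 key_idx);

static void my_handle_crypto_event(struct ieee80211_vif *vif,
				   const struct my_crypto_event *evt)
{
	struct ieee80211_key_conf *keyconf = my_lookup_keyconf(vif, evt->key_idx);

	if (!keyconf)
		return;

	if (evt->is_replay)
		ieee80211_key_replay(keyconf);		/* bump per-key replay count */
	else
		ieee80211_key_mic_failure(keyconf);	/* bump per-key ICV error count */
}
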
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index b4a2efe8e83a..dee88ec566ad 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -220,9 +220,9 @@ u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata)
BSS_CHANGED_ERP_SLOT;
}
-static void ieee80211_tasklet_handler(unsigned long data)
+static void ieee80211_tasklet_handler(struct tasklet_struct *t)
{
- struct ieee80211_local *local = (struct ieee80211_local *) data;
+ struct ieee80211_local *local = from_tasklet(local, t, tasklet);
struct sk_buff *skb;
while ((skb = skb_dequeue(&local->skb_queue)) ||
@@ -733,16 +733,12 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
skb_queue_head_init(&local->pending[i]);
atomic_set(&local->agg_queue_stop[i], 0);
}
- tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
- (unsigned long)local);
+ tasklet_setup(&local->tx_pending_tasklet, ieee80211_tx_pending);
if (ops->wake_tx_queue)
- tasklet_init(&local->wake_txqs_tasklet, ieee80211_wake_txqs,
- (unsigned long)local);
+ tasklet_setup(&local->wake_txqs_tasklet, ieee80211_wake_txqs);
- tasklet_init(&local->tasklet,
- ieee80211_tasklet_handler,
- (unsigned long) local);
+ tasklet_setup(&local->tasklet, ieee80211_tasklet_handler);
skb_queue_head_init(&local->skb_queue);
skb_queue_head_init(&local->skb_queue_unreliable);
@@ -935,14 +931,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
return -EINVAL;
}
} else {
- /*
- * WDS is currently prohibited when channel contexts are used
- * because there's no clear definition of which channel WDS
- * type interfaces use
- */
- if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS))
- return -EINVAL;
-
/* DFS is not supported with multi-channel combinations yet */
for (i = 0; i < local->hw.wiphy->n_iface_combinations; i++) {
const struct ieee80211_iface_combination *comb;
@@ -1168,7 +1156,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT;
}
- local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM;
+ local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CNTDWN_COUNTERS_NUM;
/*
* We use the number of queues for feature tests (QoS, HT) internally
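The tasklet changes above follow the tree-wide conversion from tasklet_init() with an unsigned long cookie to tasklet_setup() with a typed callback, where from_tasklet() recovers the containing structure. A minimal sketch of the same pattern, assuming a hypothetical my_dev driver structure:

#include <linux/interrupt.h>

struct my_dev {				/* hypothetical driver state */
	struct tasklet_struct tasklet;
	int pending;
};

/* new-style callback: gets the tasklet itself instead of an unsigned long */
static void my_tasklet_fn(struct tasklet_struct *t)
{
	struct my_dev *dev = from_tasklet(dev, t, tasklet);

	dev->pending = 0;
}

static void my_dev_init(struct my_dev *dev)
{
	/* replaces tasklet_init(&dev->tasklet, fn, (unsigned long)dev) */
	tasklet_setup(&dev->tasklet, my_tasklet_fn);
}
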
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 7ecd801a943b..97095b7c9c64 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -667,12 +667,41 @@ void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh)
}
}
+static void
+ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata,
+ u8 *ie, u8 ie_len)
+{
+ struct ieee80211_supported_band *sband;
+ const u8 *cap;
+ const struct ieee80211_he_operation *he_oper = NULL;
+
+ sband = ieee80211_get_sband(sdata);
+ if (!sband)
+ return;
+
+ if (!ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT) ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+ return;
+
+ sdata->vif.bss_conf.he_support = true;
+
+ cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ie, ie_len);
+ if (cap && cap[1] >= ieee80211_he_oper_size(&cap[3]))
+ he_oper = (void *)(cap + 3);
+
+ if (he_oper)
+ sdata->vif.bss_conf.he_oper.params =
+ __le32_to_cpu(he_oper->he_oper_params);
+}
+
/**
* ieee80211_fill_mesh_addresses - fill addresses of a locally originated mesh frame
* @hdr: 802.11 frame header
* @fc: frame control field
* @meshda: destination address in the mesh
- * @meshsa: source address address in the mesh. Same as TA, as frame is
+ * @meshsa: source address in the mesh. Same as TA, as frame is
* locally originated.
*
* Return the length of the 802.11 (does not include a mesh control header)
@@ -864,8 +893,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
*pos++ = 0x0;
*pos++ = ieee80211_frequency_to_channel(
csa->settings.chandef.chan->center_freq);
- bcn->csa_current_counter = csa->settings.count;
- bcn->csa_counter_offsets[0] = hdr_len + 6;
+ bcn->cntdwn_current_counter = csa->settings.count;
+ bcn->cntdwn_counter_offsets[0] = hdr_len + 6;
*pos++ = csa->settings.count;
*pos++ = WLAN_EID_CHAN_SWITCH_PARAM;
*pos++ = 6;
@@ -943,6 +972,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
bcn->tail_len = skb->len;
memcpy(bcn->tail, skb->data, bcn->tail_len);
+ ieee80211_mesh_update_bss_params(sdata, bcn->tail, bcn->tail_len);
bcn->meshconf = (struct ieee80211_meshconf_ie *)
(bcn->tail + ifmsh->meshconf_offset);
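In ieee80211_mesh_update_bss_params() above, cfg80211_find_ext_ie() returns a pointer to the start of the extension element, whose layout is [WLAN_EID_EXTENSION][len][ext EID][body...]; the HE Operation body therefore starts at cap[3], and cap[1] must cover at least ieee80211_he_oper_size() bytes before it is dereferenced. A standalone sketch of the same lookup over an arbitrary IE buffer (my_find_he_oper() is hypothetical, the helpers are real cfg80211/ieee80211 API):

#include <linux/ieee80211.h>
#include <net/cfg80211.h>

/*
 * Extension element layout: [WLAN_EID_EXTENSION][len][ext EID][body...],
 * so the HE Operation body starts 3 bytes in and "len" covers the ext EID
 * plus the body - hence the length check before dereferencing.
 */
static const struct ieee80211_he_operation *
my_find_he_oper(const u8 *ies, int ies_len)
{
	const u8 *elem = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION,
					      ies, ies_len);

	if (!elem || elem[1] < ieee80211_he_oper_size(&elem[3]))
		return NULL;

	return (const void *)&elem[3];
}
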
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index bec23d2eee7a..313eee12410e 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -212,7 +212,7 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata,
skb->priority = 7;
info->control.vif = &sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
ieee80211_set_qos_hdr(sdata, skb);
ieee80211_mps_set_frame_flags(sdata, NULL, hdr);
}
@@ -1163,7 +1163,7 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata,
if (skb_queue_len(&mpath->frame_queue) >= MESH_FRAME_QUEUE_LEN)
skb_to_free = skb_dequeue(&mpath->frame_queue);
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
ieee80211_set_qos_hdr(sdata, skb);
skb_queue_tail(&mpath->frame_queue, skb);
if (skb_to_free)
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 48f31ac9233c..620ecf922408 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -60,6 +60,7 @@ static struct mesh_table *mesh_table_alloc(void)
atomic_set(&newtbl->entries, 0);
spin_lock_init(&newtbl->gates_lock);
spin_lock_init(&newtbl->walk_lock);
+ rhashtable_init(&newtbl->rhead, &mesh_rht_params);
return newtbl;
}
@@ -773,9 +774,6 @@ int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata)
goto free_path;
}
- rhashtable_init(&tbl_path->rhead, &mesh_rht_params);
- rhashtable_init(&tbl_mpp->rhead, &mesh_rht_params);
-
sdata->u.mesh.mesh_paths = tbl_path;
sdata->u.mesh.mpp_paths = tbl_mpp;
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 15f2fc658f70..aca26df7587d 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -144,6 +144,7 @@ out:
/**
* mesh_set_ht_prot_mode - set correct HT protection mode
+ * @sdata: the (mesh) interface to handle
*
* Section 9.23.3.5 of IEEE 80211-2012 describes the protection rules for HT
* mesh STA in a MBSS. Three HT protection modes are supported for now, non-HT
diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c
index 031e905f684a..204830a55240 100644
--- a/net/mac80211/mesh_ps.c
+++ b/net/mac80211/mesh_ps.c
@@ -12,6 +12,7 @@
/**
* mps_qos_null_get - create pre-addressed QoS Null frame for mesh powersave
+ * @sta: the station to get the frame for
*/
static struct sk_buff *mps_qos_null_get(struct sta_info *sta)
{
@@ -44,6 +45,7 @@ static struct sk_buff *mps_qos_null_get(struct sta_info *sta)
/**
* mps_qos_null_tx - send a QoS Null to indicate link-specific power mode
+ * @sta: the station to send to
*/
static void mps_qos_null_tx(struct sta_info *sta)
{
@@ -400,6 +402,8 @@ static void mpsp_trigger_send(struct sta_info *sta, bool rspi, bool eosp)
/**
* mpsp_qos_null_append - append QoS Null frame to MPSP skb queue if needed
+ * @sta: the station to handle
+ * @frames: the frame list to append to
*
* To properly end a mesh MPSP the last transmitted frame has to set the EOSP
* flag in the QoS Control field. In case the current tailing frame is not a
@@ -432,7 +436,7 @@ static void mpsp_qos_null_append(struct sta_info *sta,
info = IEEE80211_SKB_CB(new_skb);
info->control.vif = &sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
__skb_queue_tail(frames, new_skb);
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2e400b0ff696..0e4d950cf907 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -149,6 +149,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_ht_operation *ht_oper,
const struct ieee80211_vht_operation *vht_oper,
const struct ieee80211_he_operation *he_oper,
+ const struct ieee80211_s1g_oper_ie *s1g_oper,
struct cfg80211_chan_def *chandef, bool tracking)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -171,6 +172,18 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
ret = 0;
vht_chandef = *chandef;
goto out;
+ } else if (sband->band == NL80211_BAND_S1GHZ) {
+ if (!ieee80211_chandef_s1g_oper(s1g_oper, chandef)) {
+ sdata_info(sdata,
+ "Missing S1G Operation Element? Trying operating == primary\n");
+ chandef->width = ieee80211_s1g_channel_width(channel);
+ }
+
+ ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_40MHZ |
+ IEEE80211_STA_DISABLE_VHT |
+ IEEE80211_STA_DISABLE_80P80MHZ |
+ IEEE80211_STA_DISABLE_160MHZ;
+ goto out;
}
memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap));
@@ -347,6 +360,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_ht_operation *ht_oper,
const struct ieee80211_vht_operation *vht_oper,
const struct ieee80211_he_operation *he_oper,
+ const struct ieee80211_s1g_oper_ie *s1g_oper,
const u8 *bssid, u32 *changed)
{
struct ieee80211_local *local = sdata->local;
@@ -393,7 +407,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
/* calculate new channel (type) based on HT/VHT/HE operation IEs */
flags = ieee80211_determine_chantype(sdata, sband, chan, vht_cap_info,
ht_oper, vht_oper, he_oper,
- &chandef, true);
+ s1g_oper, &chandef, true);
/*
* Downgrade the new channel if we associated with restricted
@@ -696,6 +710,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
struct ieee80211_chanctx_conf *chanctx_conf;
struct ieee80211_channel *chan;
u32 rates = 0;
+ __le16 listen_int;
struct element *ext_capa = NULL;
/* we know it's writable, cast away the const */
@@ -784,13 +799,15 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
memcpy(mgmt->bssid, assoc_data->bss->bssid, ETH_ALEN);
+ listen_int = cpu_to_le16(sband->band == NL80211_BAND_S1GHZ ?
+ ieee80211_encode_usf(local->hw.conf.listen_interval) :
+ local->hw.conf.listen_interval);
if (!is_zero_ether_addr(assoc_data->prev_bssid)) {
skb_put(skb, 10);
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_REASSOC_REQ);
mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab);
- mgmt->u.reassoc_req.listen_interval =
- cpu_to_le16(local->hw.conf.listen_interval);
+ mgmt->u.reassoc_req.listen_interval = listen_int;
memcpy(mgmt->u.reassoc_req.current_ap, assoc_data->prev_bssid,
ETH_ALEN);
} else {
@@ -798,8 +815,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ASSOC_REQ);
mgmt->u.assoc_req.capab_info = cpu_to_le16(capab);
- mgmt->u.assoc_req.listen_interval =
- cpu_to_le16(local->hw.conf.listen_interval);
+ mgmt->u.assoc_req.listen_interval = listen_int;
}
/* SSID */
@@ -809,6 +825,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
*pos++ = assoc_data->ssid_len;
memcpy(pos, assoc_data->ssid, assoc_data->ssid_len);
+ if (sband->band == NL80211_BAND_S1GHZ)
+ goto skip_rates;
+
/* add all rates which were marked to be used above */
supp_rates_len = rates_len;
if (supp_rates_len > 8)
@@ -844,6 +863,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
}
}
+skip_rates:
if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT ||
capab & WLAN_CAPABILITY_RADIO_MEASURE) {
pos = skb_put(skb, 4);
@@ -1018,6 +1038,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
pos = ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info);
}
+ if (sband->band == NL80211_BAND_S1GHZ) {
+ ieee80211_add_aid_request_ie(sdata, skb);
+ ieee80211_add_s1g_capab_ie(sdata, &sband->s1g_cap, skb);
+ }
+
/* add any remaining custom (i.e. vendor specific here) IEs */
if (assoc_data->ie_len) {
noffset = assoc_data->ie_len;
@@ -1392,6 +1417,17 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
return;
}
+ if (sdata->vif.bss_conf.chandef.chan->band !=
+ csa_ie.chandef.chan->band) {
+ sdata_info(sdata,
+ "AP %pM switches to different band (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n",
+ ifmgd->associated->bssid,
+ csa_ie.chandef.chan->center_freq,
+ csa_ie.chandef.width, csa_ie.chandef.center_freq1,
+ csa_ie.chandef.center_freq2);
+ goto lock_and_drop_connection;
+ }
+
if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chandef,
IEEE80211_CHAN_DISABLED)) {
sdata_info(sdata,
@@ -1404,9 +1440,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
csa_ie.chandef.width, csa_ie.chandef.center_freq1,
csa_ie.chandef.freq1_offset,
csa_ie.chandef.center_freq2);
- ieee80211_queue_work(&local->hw,
- &ifmgd->csa_connection_drop_work);
- return;
+ goto lock_and_drop_connection;
}
if (cfg80211_chandef_identical(&csa_ie.chandef,
@@ -1468,6 +1502,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
sdata->csa_chandef = csa_ie.chandef;
sdata->csa_block_tx = csa_ie.mode;
ifmgd->csa_ignored_same_chan = false;
+ ifmgd->beacon_crc_valid = false;
if (sdata->csa_block_tx)
ieee80211_stop_vif_queues(local, sdata,
@@ -1475,7 +1510,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
mutex_unlock(&local->mtx);
cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chandef,
- csa_ie.count);
+ csa_ie.count, csa_ie.mode);
if (local->ops->channel_switch) {
/* use driver's channel switch callback */
@@ -1491,6 +1526,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
TU_TO_EXP_TIME((csa_ie.count - 1) *
cbss->beacon_interval));
return;
+ lock_and_drop_connection:
+ mutex_lock(&local->mtx);
+ mutex_lock(&local->chanctx_mtx);
drop_connection:
/*
* This is just so that the disconnect flow will know that
@@ -1535,9 +1573,17 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata,
chan_increment = 1;
break;
case NL80211_BAND_5GHZ:
- case NL80211_BAND_6GHZ:
chan_increment = 4;
break;
+ case NL80211_BAND_6GHZ:
+ /*
+ * In the 6 GHz band, the "maximum transmit power level"
+ * field in the triplets is reserved, and thus will be
+ * zero and we shouldn't use it to control TX power.
+ * The actual TX power will be given in the transmit
+ * power envelope element instead.
+ */
+ return false;
}
/* find channel */
@@ -1597,6 +1643,9 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata,
int new_ap_level;
__le16 capab = mgmt->u.probe_resp.capab_info;
+ if (ieee80211_is_s1g_beacon(mgmt->frame_control))
+ return 0; /* TODO */
+
if (country_ie &&
(capab & cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT) ||
capab & cpu_to_le16(WLAN_CAPABILITY_RADIO_MEASURE))) {
@@ -2354,6 +2403,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
/* clear bssid only after building the needed mgmt frames */
eth_zero_addr(ifmgd->bssid);
+ sdata->vif.bss_conf.ssid_len = 0;
+
/* remove AP and TDLS peers */
sta_info_flush(sdata);
@@ -2432,23 +2483,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
}
-void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_hdr *hdr)
-{
- /*
- * We can postpone the mgd.timer whenever receiving unicast frames
- * from AP because we know that the connection is working both ways
- * at that time. But multicast frames (and hence also beacons) must
- * be ignored here, because we need to trigger the timer during
- * data idle periods for sending the periodic probe request to the
- * AP we're connected to.
- */
- if (is_multicast_ether_addr(hdr->addr1))
- return;
-
- ieee80211_sta_reset_conn_monitor(sdata);
-}
-
static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -2521,21 +2555,15 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
{
ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time);
- if (!ieee80211_is_data(hdr->frame_control))
- return;
-
- if (ieee80211_is_any_nullfunc(hdr->frame_control) &&
- sdata->u.mgd.probe_send_count > 0) {
- if (ack)
- ieee80211_sta_reset_conn_monitor(sdata);
- else
- sdata->u.mgd.nullfunc_failed = true;
- ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+ if (!ieee80211_is_any_nullfunc(hdr->frame_control) ||
+ !sdata->u.mgd.probe_send_count)
return;
- }
if (ack)
- ieee80211_sta_reset_conn_monitor(sdata);
+ sdata->u.mgd.probe_send_count = 0;
+ else
+ sdata->u.mgd.nullfunc_failed = true;
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
}
static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata,
@@ -2715,7 +2743,7 @@ EXPORT_SYMBOL(ieee80211_ap_probereq_get);
static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata,
const u8 *buf, size_t len, bool tx,
- u16 reason)
+ u16 reason, bool reconnect)
{
struct ieee80211_event event = {
.type = MLME_EVENT,
@@ -2724,7 +2752,7 @@ static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata,
};
if (tx)
- cfg80211_tx_mlme_mgmt(sdata->dev, buf, len);
+ cfg80211_tx_mlme_mgmt(sdata->dev, buf, len, reconnect);
else
cfg80211_rx_mlme_mgmt(sdata->dev, buf, len);
@@ -2746,13 +2774,18 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
tx = !sdata->csa_block_tx;
- /* AP is probably out of range (or not reachable for another reason) so
- * remove the bss struct for that AP.
- */
- cfg80211_unlink_bss(local->hw.wiphy, ifmgd->associated);
+ if (!ifmgd->driver_disconnect) {
+ /*
+ * AP is probably out of range (or not reachable for another
+ * reason) so remove the bss struct for that AP.
+ */
+ cfg80211_unlink_bss(local->hw.wiphy, ifmgd->associated);
+ }
ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
- WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
+ ifmgd->driver_disconnect ?
+ WLAN_REASON_DEAUTH_LEAVING :
+ WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
tx, frame_buf);
mutex_lock(&local->mtx);
sdata->vif.csa_active = false;
@@ -2765,7 +2798,9 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
mutex_unlock(&local->mtx);
ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), tx,
- WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY);
+ WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
+ ifmgd->reconnect);
+ ifmgd->reconnect = false;
sdata_unlock(sdata);
}
@@ -2784,6 +2819,13 @@ static void ieee80211_beacon_connection_loss_work(struct work_struct *work)
sdata_info(sdata, "Connection to AP %pM lost\n",
ifmgd->bssid);
__ieee80211_disconnect(sdata);
+ ifmgd->connection_loss = false;
+ } else if (ifmgd->driver_disconnect) {
+ sdata_info(sdata,
+ "Driver requested disconnection from AP %pM\n",
+ ifmgd->bssid);
+ __ieee80211_disconnect(sdata);
+ ifmgd->driver_disconnect = false;
} else {
ieee80211_mgd_probe_ap(sdata, true);
}
@@ -2822,6 +2864,21 @@ void ieee80211_connection_loss(struct ieee80211_vif *vif)
}
EXPORT_SYMBOL(ieee80211_connection_loss);
+void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_hw *hw = &sdata->local->hw;
+
+ trace_api_disconnect(sdata, reconnect);
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
+ return;
+
+ sdata->u.mgd.driver_disconnect = true;
+ sdata->u.mgd.reconnect = reconnect;
+ ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work);
+}
+EXPORT_SYMBOL(ieee80211_disconnect);
static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
bool assoc)
@@ -3125,7 +3182,7 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
ieee80211_set_disassoc(sdata, 0, 0, false, NULL);
ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false,
- reason_code);
+ reason_code, false);
return;
}
@@ -3174,7 +3231,8 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
ieee80211_set_disassoc(sdata, 0, 0, false, NULL);
- ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code);
+ ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code,
+ false);
}
static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
@@ -3194,8 +3252,8 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
*have_higher_than_11mbit = true;
/*
- * Skip HT, VHT and HE BSS membership selectors since they're
- * not rates.
+ * Skip HT, VHT, HE and SAE H2E only BSS membership selectors
+ * since they're not rates.
*
* Note: Even though the membership selector and the basic
* rate flag share the same bit, they are not exactly
@@ -3203,7 +3261,8 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
*/
if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) ||
supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY) ||
- supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY))
+ supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY) ||
+ supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_SAE_H2E))
continue;
for (j = 0; j < sband->n_bitrates; j++) {
@@ -3267,14 +3326,26 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
const struct cfg80211_bss_ies *bss_ies = NULL;
struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data;
bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ;
+ bool is_s1g = cbss->channel->band == NL80211_BAND_S1GHZ;
u32 changed = 0;
+ u8 *pos;
int err;
bool ret;
/* AssocResp and ReassocResp have identical structure */
+ pos = mgmt->u.assoc_resp.variable;
aid = le16_to_cpu(mgmt->u.assoc_resp.aid);
+ if (is_s1g) {
+ pos = (u8 *) mgmt->u.s1g_assoc_resp.variable;
+ aid = 0; /* TODO */
+ }
capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
+ ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, elems,
+ mgmt->bssid, assoc_data->bss->bssid);
+
+ if (elems->aid_resp)
+ aid = le16_to_cpu(elems->aid_resp->aid);
/*
* The 5 MSB of the AID field are reserved
@@ -3291,7 +3362,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
ifmgd->broken_ap = true;
}
- if (!elems->supp_rates) {
+ if (!is_s1g && !elems->supp_rates) {
sdata_info(sdata, "no SuppRates element in AssocResp\n");
return false;
}
@@ -3477,14 +3548,6 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
le32_get_bits(elems->he_operation->he_oper_params,
IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK);
- bss_conf->multi_sta_back_32bit =
- sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
- IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP;
-
- bss_conf->ack_enabled =
- sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
- IEEE80211_HE_MAC_CAP2_ACK_EN;
-
bss_conf->uora_exists = !!elems->uora_element;
if (elems->uora_element)
bss_conf->uora_ocw_range = elems->uora_element[0];
@@ -3533,7 +3596,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
sta->sta.mfp = false;
}
- sta->sta.wme = elems->wmm_param && local->hw.queues >= IEEE80211_NUM_ACS;
+ sta->sta.wme = (elems->wmm_param || elems->s1g_capab) &&
+ local->hw.queues >= IEEE80211_NUM_ACS;
err = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT))
@@ -3548,6 +3612,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
goto out;
}
+ if (sdata->wdev.use_4addr)
+ drv_sta_set_4addr(local, sdata, &sta->sta, true);
+
mutex_unlock(&sdata->local->sta_mtx);
/*
@@ -3605,8 +3672,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
* Start timer to probe the connection to the AP now.
* Also start the timer that will detect beacon loss.
*/
- ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt);
ieee80211_sta_reset_beacon_monitor(sdata);
+ ieee80211_sta_reset_conn_monitor(sdata);
ret = true;
out:
@@ -3625,7 +3692,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
int ac, uapsd_queues = -1;
u8 *pos;
bool reassoc;
- struct cfg80211_bss *bss;
+ struct cfg80211_bss *cbss;
struct ieee80211_event event = {
.type = MLME_EVENT,
.u.mlme.data = ASSOC_EVENT,
@@ -3635,9 +3702,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
if (!assoc_data)
return;
+
if (!ether_addr_equal(assoc_data->bss->bssid, mgmt->bssid))
return;
+ cbss = assoc_data->bss;
+
/*
* AssocResp and ReassocResp have identical structure, so process both
* of them in this function.
@@ -3649,7 +3719,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control);
capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code);
+ pos = mgmt->u.assoc_resp.variable;
aid = le16_to_cpu(mgmt->u.assoc_resp.aid);
+ if (cbss->channel->band == NL80211_BAND_S1GHZ) {
+ pos = (u8 *) mgmt->u.s1g_assoc_resp.variable;
+ aid = 0; /* TODO */
+ }
sdata_info(sdata,
"RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n",
@@ -3660,7 +3735,6 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
fils_decrypt_assoc_resp(sdata, (u8 *)mgmt, &len, assoc_data) < 0)
return;
- pos = mgmt->u.assoc_resp.variable;
ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems,
mgmt->bssid, assoc_data->bss->bssid);
@@ -3680,8 +3754,6 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
return;
}
- bss = assoc_data->bss;
-
if (status_code != WLAN_STATUS_SUCCESS) {
sdata_info(sdata, "%pM denied association (code=%d)\n",
mgmt->sa, status_code);
@@ -3690,10 +3762,10 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
event.u.mlme.reason = status_code;
drv_event_callback(sdata->local, sdata, &event);
} else {
- if (!ieee80211_assoc_success(sdata, bss, mgmt, len, &elems)) {
+ if (!ieee80211_assoc_success(sdata, cbss, mgmt, len, &elems)) {
/* oops -- internal error -- send timeout for now */
ieee80211_destroy_assoc_data(sdata, false, false);
- cfg80211_assoc_timeout(sdata->dev, bss);
+ cfg80211_assoc_timeout(sdata->dev, cbss);
return;
}
event.u.mlme.status = MLME_SUCCESS;
@@ -3714,7 +3786,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
uapsd_queues |= ieee80211_ac_to_qos_mask[ac];
}
- cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len, uapsd_queues,
+ cfg80211_rx_assoc_resp(sdata->dev, cbss, (u8 *)mgmt, len, uapsd_queues,
ifmgd->assoc_req_ies, ifmgd->assoc_req_ies_len);
}
@@ -3913,11 +3985,12 @@ static bool ieee80211_rx_our_beacon(const u8 *tx_bssid,
}
static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee80211_hdr *hdr, size_t len,
struct ieee80211_rx_status *rx_status)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+ struct ieee80211_mgmt *mgmt = (void *) hdr;
size_t baselen;
struct ieee802_11_elems elems;
struct ieee80211_local *local = sdata->local;
@@ -3927,14 +4000,24 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
u32 changed = 0;
bool erp_valid;
u8 erp_value = 0;
- u32 ncrc;
- u8 *bssid;
+ u32 ncrc = 0;
+ u8 *bssid, *variable = mgmt->u.beacon.variable;
u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN];
sdata_assert_lock(sdata);
/* Process beacon from the current BSS */
- baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt;
+ bssid = ieee80211_get_bssid(hdr, len, sdata->vif.type);
+ if (ieee80211_is_s1g_beacon(mgmt->frame_control)) {
+ struct ieee80211_ext *ext = (void *) mgmt;
+
+ if (ieee80211_is_s1g_short_beacon(ext->frame_control))
+ variable = ext->u.s1g_short_beacon.variable;
+ else
+ variable = ext->u.s1g_beacon.variable;
+ }
+
+ baselen = (u8 *) variable - (u8 *) mgmt;
if (baselen > len)
return;
@@ -3954,10 +4037,10 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
rcu_read_unlock();
if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon &&
- ieee80211_rx_our_beacon(mgmt->bssid, ifmgd->assoc_data->bss)) {
- ieee802_11_parse_elems(mgmt->u.beacon.variable,
+ ieee80211_rx_our_beacon(bssid, ifmgd->assoc_data->bss)) {
+ ieee802_11_parse_elems(variable,
len - baselen, false, &elems,
- mgmt->bssid,
+ bssid,
ifmgd->assoc_data->bss->bssid);
ieee80211_rx_bss_info(sdata, mgmt, len, rx_status);
@@ -3990,7 +4073,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
}
if (!ifmgd->associated ||
- !ieee80211_rx_our_beacon(mgmt->bssid, ifmgd->associated))
+ !ieee80211_rx_our_beacon(bssid, ifmgd->associated))
return;
bssid = ifmgd->associated->bssid;
@@ -4010,8 +4093,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
*/
ieee80211_sta_reset_beacon_monitor(sdata);
- ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4);
- ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable,
+ /* TODO: CRC currently not calculated on S1G Beacon Compatibility
+ * element (which carries the beacon interval). Don't forget to add a
+ * bit to care_about_ies[] above if mac80211 is interested in a
+ * changing S1G element.
+ */
+ if (!ieee80211_is_s1g_beacon(hdr->frame_control))
+ ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4);
+ ncrc = ieee802_11_parse_elems_crc(variable,
len - baselen, false, &elems,
care_about_ies, ncrc,
mgmt->bssid, bssid);
@@ -4045,7 +4134,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
struct ieee80211_p2p_noa_attr noa = {};
int ret;
- ret = cfg80211_get_p2p_attr(mgmt->u.beacon.variable,
+ ret = cfg80211_get_p2p_attr(variable,
len - baselen,
IEEE80211_P2P_ATTR_ABSENCE_NOTICE,
(u8 *) &noa, sizeof(noa));
@@ -4081,7 +4170,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
* the driver will use them. The synchronized view is currently
* guaranteed only in certain callbacks.
*/
- if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) {
+ if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY) &&
+ !ieee80211_is_s1g_beacon(hdr->frame_control)) {
sdata->vif.bss_conf.sync_tsf =
le64_to_cpu(mgmt->u.beacon.timestamp);
sdata->vif.bss_conf.sync_device_ts =
@@ -4089,7 +4179,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.sync_dtim_count = elems.dtim_count;
}
- if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid)
+ if ((ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid) ||
+ ieee80211_is_s1g_short_beacon(mgmt->frame_control))
return;
ifmgd->beacon_crc = ncrc;
ifmgd->beacon_crc_valid = true;
@@ -4130,9 +4221,11 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
} else {
erp_valid = false;
}
- changed |= ieee80211_handle_bss_capability(sdata,
- le16_to_cpu(mgmt->u.beacon.capab_info),
- erp_valid, erp_value);
+
+ if (!ieee80211_is_s1g_beacon(hdr->frame_control))
+ changed |= ieee80211_handle_bss_capability(sdata,
+ le16_to_cpu(mgmt->u.beacon.capab_info),
+ erp_valid, erp_value);
mutex_lock(&local->sta_mtx);
sta = sta_info_get(sdata, bssid);
@@ -4142,7 +4235,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem,
elems.vht_cap_elem, elems.ht_operation,
elems.vht_operation, elems.he_operation,
- bssid, &changed)) {
+ elems.s1g_oper, bssid, &changed)) {
mutex_unlock(&local->sta_mtx);
sdata_info(sdata,
"failed to follow AP %pM bandwidth change, disconnect\n",
@@ -4152,7 +4245,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
true, deauth_buf);
ieee80211_report_disconnect(sdata, deauth_buf,
sizeof(deauth_buf), true,
- WLAN_REASON_DEAUTH_LEAVING);
+ WLAN_REASON_DEAUTH_LEAVING,
+ false);
return;
}
@@ -4170,6 +4264,26 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
ieee80211_bss_info_change_notify(sdata, changed);
}
+void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_rx_status *rx_status;
+ struct ieee80211_hdr *hdr;
+ u16 fc;
+
+ rx_status = (struct ieee80211_rx_status *) skb->cb;
+ hdr = (struct ieee80211_hdr *) skb->data;
+ fc = le16_to_cpu(hdr->frame_control);
+
+ sdata_lock(sdata);
+ switch (fc & IEEE80211_FCTL_STYPE) {
+ case IEEE80211_STYPE_S1G_BEACON:
+ ieee80211_rx_mgmt_beacon(sdata, hdr, skb->len, rx_status);
+ break;
+ }
+ sdata_unlock(sdata);
+}
+
void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb)
{
@@ -4187,7 +4301,8 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
switch (fc & IEEE80211_FCTL_STYPE) {
case IEEE80211_STYPE_BEACON:
- ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, rx_status);
+ ieee80211_rx_mgmt_beacon(sdata, (void *)mgmt,
+ skb->len, rx_status);
break;
case IEEE80211_STYPE_PROBE_RESP:
ieee80211_rx_mgmt_probe_resp(sdata, skb);
@@ -4276,7 +4391,7 @@ static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata,
tx, frame_buf);
ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true,
- reason);
+ reason, false);
}
static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
@@ -4577,10 +4692,26 @@ static void ieee80211_sta_conn_mon_timer(struct timer_list *t)
from_timer(sdata, t, u.mgd.conn_mon_timer);
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ unsigned long timeout;
if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn)
return;
+ sta = sta_info_get(sdata, ifmgd->bssid);
+ if (!sta)
+ return;
+
+ timeout = sta->status_stats.last_ack;
+ if (time_before(sta->status_stats.last_ack, sta->rx_stats.last_rx))
+ timeout = sta->rx_stats.last_rx;
+ timeout += IEEE80211_CONNECTION_IDLE_TIME;
+
+ if (time_is_before_jiffies(timeout)) {
+ mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(timeout));
+ return;
+ }
+
ieee80211_queue_work(&local->hw, &ifmgd->monitor_work);
}
@@ -4632,7 +4763,8 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata)
if (ifmgd->auth_data)
ieee80211_destroy_auth_data(sdata, false);
cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf,
- IEEE80211_DEAUTH_FRAME_LEN);
+ IEEE80211_DEAUTH_FRAME_LEN,
+ false);
}
/* This is a bit of a hack - we should find a better and more generic
@@ -4858,6 +4990,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_ht_operation *ht_oper = NULL;
const struct ieee80211_vht_operation *vht_oper = NULL;
const struct ieee80211_he_operation *he_oper = NULL;
+ const struct ieee80211_s1g_oper_ie *s1g_oper = NULL;
struct ieee80211_supported_band *sband;
struct cfg80211_chan_def chandef;
bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ;
@@ -4961,10 +5094,23 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
if (!have_80mhz)
ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ if (sband->band == NL80211_BAND_S1GHZ) {
+ const u8 *s1g_oper_ie;
+
+ s1g_oper_ie = ieee80211_bss_get_ie(cbss,
+ WLAN_EID_S1G_OPERATION);
+ if (s1g_oper_ie && s1g_oper_ie[1] >= sizeof(*s1g_oper))
+ s1g_oper = (void *)(s1g_oper_ie + 2);
+ else
+ sdata_info(sdata,
+ "AP missing S1G operation element?\n");
+ }
+
ifmgd->flags |= ieee80211_determine_chantype(sdata, sband,
cbss->channel,
bss->vht_cap_info,
ht_oper, vht_oper, he_oper,
+ s1g_oper,
&chandef, false);
sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss),
@@ -5091,6 +5237,12 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
const struct cfg80211_bss_ies *ies;
int shift = ieee80211_vif_get_shift(&sdata->vif);
+ /* TODO: S1G Basic Rate Set is expressed elsewhere */
+ if (cbss->channel->band == NL80211_BAND_S1GHZ) {
+ ieee80211_s1g_sta_rate_init(new_sta);
+ goto skip_rates;
+ }
+
ieee80211_get_rates(sband, bss->supp_rates,
bss->supp_rates_len,
&rates, &basic_rates,
@@ -5135,6 +5287,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
else
sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
+skip_rates:
memcpy(ifmgd->bssid, cbss->bssid, ETH_ALEN);
/* set timing information */
@@ -5325,7 +5478,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
ieee80211_report_disconnect(sdata, frame_buf,
sizeof(frame_buf), true,
- WLAN_REASON_UNSPECIFIED);
+ WLAN_REASON_UNSPECIFIED,
+ false);
}
sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid);
@@ -5359,12 +5513,14 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
struct cfg80211_assoc_request *req)
{
bool is_6ghz = req->bss->channel->band == NL80211_BAND_6GHZ;
+ bool is_5ghz = req->bss->channel->band == NL80211_BAND_5GHZ;
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_bss *bss = (void *)req->bss->priv;
struct ieee80211_mgd_assoc_data *assoc_data;
const struct cfg80211_bss_ies *beacon_ies;
struct ieee80211_supported_band *sband;
+ struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
const u8 *ssidie, *ht_ie, *vht_ie;
int i, err;
bool override = false;
@@ -5382,6 +5538,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
}
memcpy(assoc_data->ssid, ssidie + 2, ssidie[1]);
assoc_data->ssid_len = ssidie[1];
+ memcpy(bss_conf->ssid, assoc_data->ssid, assoc_data->ssid_len);
+ bss_conf->ssid_len = assoc_data->ssid_len;
rcu_read_unlock();
if (ifmgd->associated) {
@@ -5396,7 +5554,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
ieee80211_report_disconnect(sdata, frame_buf,
sizeof(frame_buf), true,
- WLAN_REASON_UNSPECIFIED);
+ WLAN_REASON_UNSPECIFIED,
+ false);
}
if (ifmgd->auth_data && !ifmgd->auth_data->done) {
@@ -5462,6 +5621,10 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
memcpy(&ifmgd->vht_capa_mask, &req->vht_capa_mask,
sizeof(ifmgd->vht_capa_mask));
+ memcpy(&ifmgd->s1g_capa, &req->s1g_capa, sizeof(ifmgd->s1g_capa));
+ memcpy(&ifmgd->s1g_capa_mask, &req->s1g_capa_mask,
+ sizeof(ifmgd->s1g_capa_mask));
+
if (req->ie && req->ie_len) {
memcpy(assoc_data->ie, req->ie, req->ie_len);
assoc_data->ie_len = req->ie_len;
@@ -5507,7 +5670,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
if (vht_ie && vht_ie[1] >= sizeof(struct ieee80211_vht_cap))
memcpy(&assoc_data->ap_vht_cap, vht_ie + 2,
sizeof(struct ieee80211_vht_cap));
- else if (!is_6ghz)
+ else if (is_5ghz)
ifmgd->flags |= IEEE80211_STA_DISABLE_VHT |
IEEE80211_STA_DISABLE_HE;
rcu_read_unlock();
@@ -5691,7 +5854,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
ieee80211_destroy_auth_data(sdata, false);
ieee80211_report_disconnect(sdata, frame_buf,
sizeof(frame_buf), true,
- req->reason_code);
+ req->reason_code, false);
return 0;
}
@@ -5711,7 +5874,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
ieee80211_destroy_assoc_data(sdata, false, true);
ieee80211_report_disconnect(sdata, frame_buf,
sizeof(frame_buf), true,
- req->reason_code);
+ req->reason_code, false);
return 0;
}
@@ -5726,7 +5889,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
req->reason_code, tx, frame_buf);
ieee80211_report_disconnect(sdata, frame_buf,
sizeof(frame_buf), true,
- req->reason_code);
+ req->reason_code, false);
return 0;
}
@@ -5759,7 +5922,7 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
frame_buf);
ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true,
- req->reason_code);
+ req->reason_code, false);
return 0;
}
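The ieee80211_disconnect() export added in this file lets a driver ask mac80211 to deauthenticate from the current AP, with the reconnect flag propagated through ieee80211_report_disconnect() so userspace is hinted that it may immediately retry. A hedged sketch of a driver-side use, assuming a hypothetical firmware-error hook my_fw_link_error():

#include <net/mac80211.h>

/* hypothetical firmware-error hook; only ieee80211_disconnect() is real API */
static void my_fw_link_error(struct ieee80211_vif *vif, bool can_retry)
{
	if (vif->type != NL80211_IFTYPE_STATION)
		return;

	/* reconnect=true tells userspace (via the deauth event) it may retry */
	ieee80211_disconnect(vif, can_retry);
}
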
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index f470d1a7ce9b..853c9a369d72 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -26,8 +26,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
-
- local->offchannel_ps_enabled = false;
+ bool offchannel_ps_enabled = false;
/* FIXME: what to do when local->pspolling is true? */
@@ -38,12 +37,12 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
cancel_work_sync(&local->dynamic_ps_enable_work);
if (local->hw.conf.flags & IEEE80211_CONF_PS) {
- local->offchannel_ps_enabled = true;
+ offchannel_ps_enabled = true;
local->hw.conf.flags &= ~IEEE80211_CONF_PS;
ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
}
- if (!local->offchannel_ps_enabled ||
+ if (!offchannel_ps_enabled ||
!ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK))
/*
* If power save was enabled, no need to send a nullfunc
@@ -58,38 +57,19 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
ieee80211_send_nullfunc(local, sdata, true);
}
-/* inform AP that we are awake again, unless power save is enabled */
+/* inform AP that we are awake again */
static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_local *local = sdata->local;
if (!local->ps_sdata)
ieee80211_send_nullfunc(local, sdata, false);
- else if (local->offchannel_ps_enabled) {
- /*
- * In !IEEE80211_HW_PS_NULLFUNC_STACK case the hardware
- * will send a nullfunc frame with the powersave bit set
- * even though the AP already knows that we are sleeping.
- * This could be avoided by sending a null frame with power
- * save bit disabled before enabling the power save, but
- * this doesn't gain anything.
- *
- * When IEEE80211_HW_PS_NULLFUNC_STACK is enabled, no need
- * to send a nullfunc frame because AP already knows that
- * we are sleeping, let's just enable power save mode in
- * hardware.
- */
- /* TODO: Only set hardware if CONF_PS changed?
- * TODO: Should we set offchannel_ps_enabled to false?
- */
- local->hw.conf.flags |= IEEE80211_CONF_PS;
- ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
- } else if (local->hw.conf.dynamic_ps_timeout > 0) {
+ else if (local->hw.conf.dynamic_ps_timeout > 0) {
/*
- * If IEEE80211_CONF_PS was not set and the dynamic_ps_timer
- * had been running before leaving the operating channel,
- * restart the timer now and send a nullfunc frame to inform
- * the AP that we are awake.
+ * If the dynamic_ps_timer had been running before leaving the
+ * operating channel, restart the timer now and send a nullfunc
+ * frame to inform the AP that we are awake, so that the AP sends
+ * any buffered packets.
*/
ieee80211_send_nullfunc(local, sdata, false);
mod_timer(&local->dynamic_ps_timer, jiffies +
@@ -916,7 +896,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
if (beacon)
for (i = 0; i < params->n_csa_offsets; i++)
data[params->csa_offsets[i]] =
- beacon->csa_current_counter;
+ beacon->cntdwn_current_counter;
rcu_read_unlock();
}
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index 38c45e1dafd8..ae378a41c927 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -150,21 +150,6 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
case NL80211_IFTYPE_STATION:
ieee80211_mgd_quiesce(sdata);
break;
- case NL80211_IFTYPE_WDS:
- /* tear down aggregation sessions and remove STAs */
- mutex_lock(&local->sta_mtx);
- sta = sdata->u.wds.sta;
- if (sta && sta->uploaded) {
- enum ieee80211_sta_state state;
-
- state = sta->sta_state;
- for (; state > IEEE80211_STA_NOTEXIST; state--)
- WARN_ON(drv_sta_state(local, sta->sdata,
- sta, state,
- state - 1));
- }
- mutex_unlock(&local->sta_mtx);
- break;
default:
break;
}
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index b051f125d3af..63652c39c8e0 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -51,6 +51,13 @@ void rate_control_rate_init(struct sta_info *sta)
sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band];
+ /* TODO: check for minstrel_s1g ? */
+ if (sband->band == NL80211_BAND_S1GHZ) {
+ ieee80211_s1g_sta_rate_init(sta);
+ rcu_read_unlock();
+ return;
+ }
+
spin_lock_bh(&sta->rate_ctrl_lock);
ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista,
priv_sta);
@@ -266,10 +273,15 @@ void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata)
if (WARN_ON(!sdata->vif.bss_conf.chandef.chan))
return;
+ band = sdata->vif.bss_conf.chandef.chan->band;
+ if (band == NL80211_BAND_S1GHZ) {
+ /* TODO */
+ return;
+ }
+
if (WARN_ON_ONCE(!basic_rates))
return;
- band = sdata->vif.bss_conf.chandef.chan->band;
user_mask = sdata->rc_rateidx_mask[band];
sband = local->hw.wiphy->bands[band];
@@ -296,21 +308,29 @@ static bool rc_no_data_or_no_ack_use_min(struct ieee80211_tx_rate_control *txrc)
!ieee80211_is_data(fc);
}
-static void rc_send_low_basicrate(s8 *idx, u32 basic_rates,
+static void rc_send_low_basicrate(struct ieee80211_tx_rate *rate,
+ u32 basic_rates,
struct ieee80211_supported_band *sband)
{
u8 i;
+ if (sband->band == NL80211_BAND_S1GHZ) {
+ /* TODO */
+ rate->flags |= IEEE80211_TX_RC_S1G_MCS;
+ rate->idx = 0;
+ return;
+ }
+
if (basic_rates == 0)
return; /* assume basic rates unknown and accept rate */
- if (*idx < 0)
+ if (rate->idx < 0)
return;
- if (basic_rates & (1 << *idx))
+ if (basic_rates & (1 << rate->idx))
return; /* selected rate is a basic rate */
- for (i = *idx + 1; i <= sband->n_bitrates; i++) {
+ for (i = rate->idx + 1; i <= sband->n_bitrates; i++) {
if (basic_rates & (1 << i)) {
- *idx = i;
+ rate->idx = i;
return;
}
}
@@ -328,6 +348,12 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
u32 rate_flags =
ieee80211_chandef_rate_flags(&hw->conf.chandef);
+ if (sband->band == NL80211_BAND_S1GHZ) {
+ info->control.rates[0].flags |= IEEE80211_TX_RC_S1G_MCS;
+ info->control.rates[0].idx = 0;
+ return;
+ }
+
if ((sband->band == NL80211_BAND_2GHZ) &&
(info->flags & IEEE80211_TX_CTL_NO_CCK_RATE))
rate_flags |= IEEE80211_RATE_ERP_G;
@@ -388,7 +414,7 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta,
}
if (use_basicrate)
- rc_send_low_basicrate(&info->control.rates[0].idx,
+ rc_send_low_basicrate(&info->control.rates[0],
txrc->bss_conf->basic_rates,
sband);
@@ -934,7 +960,8 @@ int rate_control_set_rates(struct ieee80211_hw *hw,
if (old)
kfree_rcu(old, rcu_head);
- drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta);
+ if (sta->uploaded)
+ drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta);
ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta));
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 86bc469a28bc..b13b1da19386 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -274,7 +274,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband,
success = !!(info->flags & IEEE80211_TX_STAT_ACK);
for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
- if (ar[i].idx < 0)
+ if (ar[i].idx < 0 || !ar[i].count)
break;
ndx = rix_to_ndx(mi, ar[i].idx);
@@ -287,12 +287,6 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband,
mi->r[ndx].stats.success += success;
}
- if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0))
- mi->sample_packets++;
-
- if (mi->sample_deferred > 0)
- mi->sample_deferred--;
-
if (time_after(jiffies, mi->last_stats_update +
mp->update_interval / (mp->new_avg ? 2 : 1)))
minstrel_update_stats(mp, mi);
@@ -367,7 +361,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
return;
delta = (mi->total_packets * sampling_ratio / 100) -
- (mi->sample_packets + mi->sample_deferred / 2);
+ mi->sample_packets;
/* delta < 0: no sampling required */
prev_sample = mi->prev_sample;
@@ -376,7 +370,6 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
return;
if (mi->total_packets >= 10000) {
- mi->sample_deferred = 0;
mi->sample_packets = 0;
mi->total_packets = 0;
} else if (delta > mi->n_rates * 2) {
@@ -401,19 +394,8 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
* rate sampling method should be used.
* Respect such rates that are not sampled for 20 iterations.
*/
- if (mrr_capable &&
- msr->perfect_tx_time > mr->perfect_tx_time &&
- msr->stats.sample_skipped < 20) {
- /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark
- * packets that have the sampling rate deferred to the
- * second MRR stage. Increase the sample counter only
- * if the deferred sample rate was actually used.
- * Use the sample_deferred counter to make sure that
- * the sampling is not done in large bursts */
- info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
- rate++;
- mi->sample_deferred++;
- } else {
+ if (msr->perfect_tx_time < mr->perfect_tx_time ||
+ msr->stats.sample_skipped >= 20) {
if (!msr->sample_limit)
return;
@@ -433,6 +415,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
rate->idx = mi->r[ndx].rix;
rate->count = minstrel_get_retry_count(&mi->r[ndx], info);
+ info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
}
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index dbb43bcd3c45..86cd80b3ffde 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -126,7 +126,6 @@ struct minstrel_sta_info {
u8 max_prob_rate;
unsigned int total_packets;
unsigned int sample_packets;
- int sample_deferred;
unsigned int sample_row;
unsigned int sample_column;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index a959ebf56852..972895e9f22d 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -32,61 +32,6 @@
#include "wme.h"
#include "rate.h"
-static inline void ieee80211_rx_stats(struct net_device *dev, u32 len)
-{
- struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += len;
- u64_stats_update_end(&tstats->syncp);
-}
-
-static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
- enum nl80211_iftype type)
-{
- __le16 fc = hdr->frame_control;
-
- if (ieee80211_is_data(fc)) {
- if (len < 24) /* drop incorrect hdr len (data) */
- return NULL;
-
- if (ieee80211_has_a4(fc))
- return NULL;
- if (ieee80211_has_tods(fc))
- return hdr->addr1;
- if (ieee80211_has_fromds(fc))
- return hdr->addr2;
-
- return hdr->addr3;
- }
-
- if (ieee80211_is_mgmt(fc)) {
- if (len < 24) /* drop incorrect hdr len (mgmt) */
- return NULL;
- return hdr->addr3;
- }
-
- if (ieee80211_is_ctl(fc)) {
- if (ieee80211_is_pspoll(fc))
- return hdr->addr1;
-
- if (ieee80211_is_back_req(fc)) {
- switch (type) {
- case NL80211_IFTYPE_STATION:
- return hdr->addr2;
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_AP_VLAN:
- return hdr->addr1;
- default:
- break; /* fall through to the return */
- }
- }
- }
-
- return NULL;
-}
-
/*
* monitor mode reception
*
@@ -887,7 +832,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
if (skb) {
skb->dev = sdata->dev;
- ieee80211_rx_stats(skb->dev, skb->len);
+ dev_sw_netstats_rx_add(skb->dev, skb->len);
netif_receive_skb(skb);
}
}
@@ -1522,7 +1467,6 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
if (unlikely((ieee80211_is_data(hdr->frame_control) ||
ieee80211_is_pspoll(hdr->frame_control)) &&
rx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
- rx->sdata->vif.type != NL80211_IFTYPE_WDS &&
rx->sdata->vif.type != NL80211_IFTYPE_OCB &&
(!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC)))) {
/*
@@ -1802,7 +1746,8 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
}
} else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) {
sta->rx_stats.last_rx = jiffies;
- } else if (!is_multicast_ether_addr(hdr->addr1)) {
+ } else if (!ieee80211_is_s1g_beacon(hdr->frame_control) &&
+ !is_multicast_ether_addr(hdr->addr1)) {
/*
* Mesh beacons will update last_rx when if they are found to
* match the current local configuration when processed.
@@ -1812,9 +1757,6 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
sta->rx_stats.last_rate = sta_stats_encode_rate(status);
}
- if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
- ieee80211_sta_rx_notify(rx->sdata, hdr);
-
sta->rx_stats.fragments++;
u64_stats_update_begin(&rx->sta->rx_stats.syncp);
@@ -1840,6 +1782,9 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
}
}
+ if (ieee80211_is_s1g_beacon(hdr->frame_control))
+ return RX_CONTINUE;
+
/*
* Change STA power saving mode only at the end of a frame
* exchange sequence, and only for a data or management
@@ -1950,6 +1895,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
__le16 fc;
const struct ieee80211_cipher_scheme *cs = NULL;
+ if (ieee80211_is_ext(hdr->frame_control))
+ return RX_CONTINUE;
+
/*
* Key selection 101
*
@@ -2258,7 +2206,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
hdr = (struct ieee80211_hdr *)rx->skb->data;
fc = hdr->frame_control;
- if (ieee80211_is_ctl(fc))
+ if (ieee80211_is_ctl(fc) || ieee80211_is_ext(fc))
return RX_CONTINUE;
sc = le16_to_cpu(hdr->seq_ctrl);
@@ -2601,7 +2549,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
skb = rx->skb;
xmit_skb = NULL;
- ieee80211_rx_stats(dev, skb->len);
+ dev_sw_netstats_rx_add(dev, skb->len);
if (rx->sta) {
/* The seqno index has the same property as needed
@@ -2900,7 +2848,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY);
info = IEEE80211_SKB_CB(fwd_skb);
memset(info, 0, sizeof(*info));
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
info->control.vif = &rx->sdata->vif;
info->control.jiffies = jiffies;
if (is_multicast_ether_addr(fwd_hdr->addr1)) {
@@ -3132,6 +3080,9 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ if (ieee80211_is_s1g_beacon(mgmt->frame_control))
+ return RX_CONTINUE;
+
/*
* From here on, look only at management frames.
* Data and control frames are already handled,
@@ -3599,6 +3550,27 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
}
static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_ext(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_hdr *hdr = (void *)rx->skb->data;
+
+ if (!ieee80211_is_ext(hdr->frame_control))
+ return RX_CONTINUE;
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return RX_DROP_MONITOR;
+
+ /* for now only beacons are ext, so queue them */
+ skb_queue_tail(&sdata->skb_queue, rx->skb);
+ ieee80211_queue_work(&rx->local->hw, &sdata->work);
+ if (rx->sta)
+ rx->sta->rx_stats.packets++;
+
+ return RX_QUEUED;
+}
+
+static ieee80211_rx_result debug_noinline
ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
{
struct ieee80211_sub_if_data *sdata = rx->sdata;
@@ -3716,7 +3688,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
}
prev_dev = sdata->dev;
- ieee80211_rx_stats(sdata->dev, skb->len);
+ dev_sw_netstats_rx_add(sdata->dev, skb->len);
}
if (prev_dev) {
@@ -3817,6 +3789,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
CALL_RXH(ieee80211_rx_h_userspace_mgmt);
CALL_RXH(ieee80211_rx_h_action_post_userspace);
CALL_RXH(ieee80211_rx_h_action_return);
+ CALL_RXH(ieee80211_rx_h_ext);
CALL_RXH(ieee80211_rx_h_mgmt);
rxh_next:
@@ -3983,7 +3956,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
struct ieee80211_hdr *hdr = (void *)skb->data;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
- bool multicast = is_multicast_ether_addr(hdr->addr1);
+ bool multicast = is_multicast_ether_addr(hdr->addr1) ||
+ ieee80211_is_s1g_beacon(hdr->frame_control);
switch (sdata->vif.type) {
case NL80211_IFTYPE_STATION:
@@ -4095,10 +4069,6 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
return false;
return true;
- case NL80211_IFTYPE_WDS:
- if (bssid || !ieee80211_is_data(hdr->frame_control))
- return false;
- return ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2);
case NL80211_IFTYPE_P2P_DEVICE:
return ieee80211_is_public_action(hdr, skb->len) ||
ieee80211_is_probe_req(hdr->frame_control) ||
@@ -4149,7 +4119,6 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
fastrx.expected_ds_bits = 0;
} else {
- fastrx.sta_notify = sdata->u.mgd.probe_send_count > 0;
fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr3);
fastrx.expected_ds_bits =
@@ -4207,6 +4176,8 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
rcu_read_lock();
key = rcu_dereference(sta->ptk[sta->ptk_idx]);
+ if (!key)
+ key = rcu_dereference(sdata->default_unicast_key);
if (key) {
switch (key->conf.cipher) {
case WLAN_CIPHER_SUITE_TKIP:
@@ -4379,11 +4350,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
pskb_trim(skb, skb->len - fast_rx->icv_len))
goto drop;
- if (unlikely(fast_rx->sta_notify)) {
- ieee80211_sta_rx_notify(rx->sdata, hdr);
- fast_rx->sta_notify = false;
- }
-
/* statistics part of ieee80211_rx_h_sta_process() */
if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
stats->last_signal = status->signal;
@@ -4437,7 +4403,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
skb->dev = fast_rx->dev;
- ieee80211_rx_stats(fast_rx->dev, skb->len);
+ dev_sw_netstats_rx_add(fast_rx->dev, skb->len);
/* The seqno index has the same property as needed
* for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS
@@ -4587,7 +4553,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
ieee80211_verify_alignment(&rx);
if (unlikely(ieee80211_is_probe_resp(hdr->frame_control) ||
- ieee80211_is_beacon(hdr->frame_control)))
+ ieee80211_is_beacon(hdr->frame_control) ||
+ ieee80211_is_s1g_beacon(hdr->frame_control)))
ieee80211_scan_rx(local, skb);
if (ieee80211_is_data(fc)) {
@@ -4762,6 +4729,8 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
status->rx_flags = 0;
+ kcov_remote_start_common(skb_get_kcov_handle(skb));
+
/*
* Frames with failed FCS/PLCP checksum are not returned,
* all other frames are returned without radiotap header
@@ -4769,15 +4738,15 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
* Also, frames with less than 16 bytes are dropped.
*/
skb = ieee80211_rx_monitor(local, skb, rate);
- if (!skb)
- return;
-
- ieee80211_tpt_led_trig_rx(local,
- ((struct ieee80211_hdr *)skb->data)->frame_control,
- skb->len);
+ if (skb) {
+ ieee80211_tpt_led_trig_rx(local,
+ ((struct ieee80211_hdr *)skb->data)->frame_control,
+ skb->len);
- __ieee80211_rx_handle_packet(hw, pubsta, skb, list);
+ __ieee80211_rx_handle_packet(hw, pubsta, skb, list);
+ }
+ kcov_remote_stop();
return;
drop:
kfree_skb(skb);
diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c
new file mode 100644
index 000000000000..c33f332b049a
--- /dev/null
+++ b/net/mac80211/s1g.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * S1G handling
+ * Copyright(c) 2020 Adapt-IP
+ */
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+
+void ieee80211_s1g_sta_rate_init(struct sta_info *sta)
+{
+ /* avoid indicating legacy bitrates for S1G STAs */
+ sta->tx_stats.last_rate.flags |= IEEE80211_TX_RC_S1G_MCS;
+ sta->rx_stats.last_rate =
+ STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_S1G);
+}
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 5ac2785cdc7b..d4cc9ac2d703 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -9,7 +9,7 @@
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
* Copyright 2016-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*/
#include <linux/if_arp.h>
@@ -146,7 +146,8 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
struct ieee80211_mgmt *mgmt, size_t len,
struct ieee80211_channel *channel)
{
- bool beacon = ieee80211_is_beacon(mgmt->frame_control);
+ bool beacon = ieee80211_is_beacon(mgmt->frame_control) ||
+ ieee80211_is_s1g_beacon(mgmt->frame_control);
struct cfg80211_bss *cbss, *non_tx_cbss;
struct ieee80211_bss *bss, *non_tx_bss;
struct cfg80211_inform_bss bss_meta = {
@@ -195,6 +196,11 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
elements = mgmt->u.probe_resp.variable;
baselen = offsetof(struct ieee80211_mgmt,
u.probe_resp.variable);
+ } else if (ieee80211_is_s1g_beacon(mgmt->frame_control)) {
+ struct ieee80211_ext *ext = (void *) mgmt;
+
+ baselen = offsetof(struct ieee80211_ext, u.s1g_beacon.variable);
+ elements = ext->u.s1g_beacon.variable;
} else {
baselen = offsetof(struct ieee80211_mgmt, u.beacon.variable);
elements = mgmt->u.beacon.variable;
@@ -246,9 +252,12 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb)
struct ieee80211_bss *bss;
struct ieee80211_channel *channel;
- if (skb->len < 24 ||
- (!ieee80211_is_probe_resp(mgmt->frame_control) &&
- !ieee80211_is_beacon(mgmt->frame_control)))
+ if (ieee80211_is_s1g_beacon(mgmt->frame_control)) {
+ if (skb->len < 15)
+ return;
+ } else if (skb->len < 24 ||
+ (!ieee80211_is_probe_resp(mgmt->frame_control) &&
+ !ieee80211_is_beacon(mgmt->frame_control)))
return;
sdata1 = rcu_dereference(local->scan_sdata);
@@ -712,6 +721,10 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
req->duration_mandatory;
local->hw_scan_band = 0;
+ local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params;
+ local->hw_scan_req->req.scan_6ghz_params =
+ req->scan_6ghz_params;
+ local->hw_scan_req->req.scan_6ghz = req->scan_6ghz;
/*
* After allocating local->hw_scan_req, we must
@@ -905,6 +918,17 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
local->scan_chandef.center_freq1 = chan->center_freq;
local->scan_chandef.freq1_offset = chan->freq_offset;
local->scan_chandef.center_freq2 = 0;
+
+ /* For scanning on the S1G band, ignore scan_width (which is constant
+ * across all channels) for now since channel width is specific to each
+ * channel. Detect the required channel width here and likely revisit
+ * later. Maybe scan_width could be used to build the channel scan list?
+ */
+ if (chan->band == NL80211_BAND_S1GHZ) {
+ local->scan_chandef.width = ieee80211_s1g_channel_width(chan);
+ goto set_channel;
+ }
+
switch (scan_req->scan_width) {
case NL80211_BSS_CHAN_WIDTH_5:
local->scan_chandef.width = NL80211_CHAN_WIDTH_5;
@@ -925,8 +949,14 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
else
local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
break;
+ case NL80211_BSS_CHAN_WIDTH_1:
+ case NL80211_BSS_CHAN_WIDTH_2:
+ /* shouldn't get here, S1G handled above */
+ WARN_ON(1);
+ break;
}
+set_channel:
if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL))
skip = 1;
@@ -1124,7 +1154,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
int max_n;
for (band = 0; band < NUM_NL80211_BANDS; band++) {
- if (!local->hw.wiphy->bands[band])
+ if (!local->hw.wiphy->bands[band] ||
+ band == NL80211_BAND_6GHZ)
continue;
max_n = local->hw.wiphy->bands[band]->n_channels;
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index ae1cb2c68722..76747bfdaddd 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -133,16 +133,20 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
}
if (wide_bw_chansw_ie) {
+ u8 new_seg1 = wide_bw_chansw_ie->new_center_freq_seg1;
struct ieee80211_vht_operation vht_oper = {
.chan_width =
wide_bw_chansw_ie->new_channel_width,
.center_freq_seg0_idx =
wide_bw_chansw_ie->new_center_freq_seg0,
- .center_freq_seg1_idx =
- wide_bw_chansw_ie->new_center_freq_seg1,
+ .center_freq_seg1_idx = new_seg1,
/* .basic_mcs_set doesn't matter */
};
- struct ieee80211_ht_operation ht_oper = {};
+ struct ieee80211_ht_operation ht_oper = {
+ .operation_mode =
+ cpu_to_le16(new_seg1 <<
+ IEEE80211_HT_OP_MODE_CCFS2_SHIFT),
+ };
/* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT,
* to the previously parsed chandef
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index f2840d1d95cf..ec6973ee88ef 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -258,6 +258,24 @@ struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
*/
void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
{
+ /*
+ * If we had used sta_info_pre_move_state() then we might not
+ * have gone through the state transitions down again, so do
+ * it here now (and warn if it's inserted).
+ *
+ * This will clear state such as fast TX/RX that may have been
+ * allocated during state transitions.
+ */
+ while (sta->sta_state > IEEE80211_STA_NONE) {
+ int ret;
+
+ WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED));
+
+ ret = sta_info_move_state(sta, sta->sta_state - 1);
+ if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret))
+ break;
+ }
+
if (sta->rate_ctrl)
rate_control_free_sta(sta);
@@ -687,7 +705,7 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
out_drop_sta:
local->num_sta--;
synchronize_net();
- __cleanup_single_sta(sta);
+ cleanup_single_sta(sta);
out_err:
mutex_unlock(&local->sta_mtx);
kfree(sinfo);
@@ -706,19 +724,13 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
err = sta_info_insert_check(sta);
if (err) {
+ sta_info_free(local, sta);
mutex_unlock(&local->sta_mtx);
rcu_read_lock();
- goto out_free;
+ return err;
}
- err = sta_info_insert_finish(sta);
- if (err)
- goto out_free;
-
- return 0;
- out_free:
- sta_info_free(local, sta);
- return err;
+ return sta_info_insert_finish(sta);
}
int sta_info_insert(struct sta_info *sta)
@@ -2122,6 +2134,10 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate,
int rate_idx = STA_STATS_GET(LEGACY_IDX, rate);
sband = local->hw.wiphy->bands[band];
+
+ if (WARN_ON_ONCE(!sband->bitrates))
+ break;
+
brate = sband->bitrates[rate_idx].bitrate;
if (rinfo->bw == RATE_INFO_BW_5)
shift = 2;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index d5010116cf4d..7afd07636b81 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -336,7 +336,6 @@ struct ieee80211_fast_tx {
* @expected_ds_bits: from/to DS bits expected
* @icv_len: length of the MIC if present
* @key: bool indicating encryption is expected (key is set)
- * @sta_notify: notify the MLME code (once)
* @internal_forward: forward frames internally on AP/VLAN type interfaces
* @uses_rss: copy of USES_RSS hw flag
* @da_offs: offset of the DA in the header (for header conversion)
@@ -352,7 +351,6 @@ struct ieee80211_fast_rx {
__le16 expected_ds_bits;
u8 icv_len;
u8 key:1,
- sta_notify:1,
internal_forward:1,
uses_rss:1;
u8 da_offs, sa_offs;
@@ -787,7 +785,7 @@ int sta_info_init(struct ieee80211_local *local);
void sta_info_stop(struct ieee80211_local *local);
/**
- * sta_info_flush - flush matching STA entries from the STA table
+ * __sta_info_flush - flush matching STA entries from the STA table
*
* Returns the number of removed STA entries.
*
@@ -796,6 +794,13 @@ void sta_info_stop(struct ieee80211_local *local);
*/
int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans);
+/**
+ * sta_info_flush - flush matching STA entries from the STA table
+ *
+ * Returns the number of removed STA entries.
+ *
+ * @sdata: sdata to remove all stations from
+ */
static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata)
{
return __sta_info_flush(sdata, false);
@@ -825,6 +830,7 @@ enum sta_stats_type {
STA_STATS_RATE_TYPE_HT,
STA_STATS_RATE_TYPE_VHT,
STA_STATS_RATE_TYPE_HE,
+ STA_STATS_RATE_TYPE_S1G,
};
#define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0)
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 0794396a7988..3485610755ef 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -49,7 +49,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
int ac;
if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER |
- IEEE80211_TX_CTL_AMPDU)) {
+ IEEE80211_TX_CTL_AMPDU |
+ IEEE80211_TX_CTL_HW_80211_ENCAP)) {
ieee80211_free_txskb(&local->hw, skb);
return;
}
@@ -66,8 +67,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
info->control.jiffies = jiffies;
info->control.vif = &sta->sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING |
- IEEE80211_TX_INTFL_RETRANSMISSION;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
+ info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION;
info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
sta->status_stats.filtered++;
@@ -184,18 +185,6 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
struct ieee80211_mgmt *mgmt = (void *) skb->data;
struct ieee80211_local *local = sta->local;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb);
-
- if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
- sta->status_stats.last_ack = jiffies;
- if (txinfo->status.is_valid_ack_signal) {
- sta->status_stats.last_ack_signal =
- (s8)txinfo->status.ack_signal;
- sta->status_stats.ack_signal_filled = true;
- ewma_avg_signal_add(&sta->status_stats.avg_ack_signal,
- -txinfo->status.ack_signal);
- }
- }
if (ieee80211_is_data_qos(mgmt->frame_control)) {
struct ieee80211_hdr *hdr = (void *) skb->data;
@@ -890,7 +879,8 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
}
static void __ieee80211_tx_status(struct ieee80211_hw *hw,
- struct ieee80211_tx_status *status)
+ struct ieee80211_tx_status *status,
+ int rates_idx, int retry_count)
{
struct sk_buff *skb = status->skb;
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
@@ -899,17 +889,12 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
struct sta_info *sta;
__le16 fc;
struct ieee80211_supported_band *sband;
- int retry_count;
- int rates_idx;
bool send_to_cooked;
bool acked;
bool noack_success;
struct ieee80211_bar *bar;
int shift = 0;
int tid = IEEE80211_NUM_TIDS;
- u16 tx_time_est;
-
- rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
sband = local->hw.wiphy->bands[info->band];
fc = hdr->frame_control;
@@ -931,15 +916,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
ieee80211_mpsp_trigger_process(
ieee80211_get_qos_ctl(hdr), sta, true, acked);
- if (!acked && test_sta_flag(sta, WLAN_STA_PS_STA)) {
- /*
- * The STA is in power save mode, so assume
- * that this TX packet failed because of that.
- */
- ieee80211_handle_filtered_frame(local, sta, skb);
- return;
- }
-
if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) &&
(ieee80211_is_data(hdr->frame_control)) &&
(rates_idx != -1))
@@ -987,62 +963,17 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) {
ieee80211_handle_filtered_frame(local, sta, skb);
return;
- } else {
+ } else if (ieee80211_is_data_present(fc)) {
if (!acked && !noack_success)
- sta->status_stats.retry_failed++;
- sta->status_stats.retry_count += retry_count;
+ sta->status_stats.msdu_failed[tid]++;
- if (ieee80211_is_data_present(fc)) {
- if (!acked && !noack_success)
- sta->status_stats.msdu_failed[tid]++;
-
- sta->status_stats.msdu_retries[tid] +=
- retry_count;
- }
+ sta->status_stats.msdu_retries[tid] +=
+ retry_count;
}
- rate_control_tx_status(local, sband, status);
- if (ieee80211_vif_is_mesh(&sta->sdata->vif))
- ieee80211s_update_metric(local, sta, status);
-
if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked)
ieee80211_frame_acked(sta, skb);
- if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) &&
- ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
- ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data,
- acked, info->status.tx_time);
-
- if (info->status.tx_time &&
- wiphy_ext_feature_isset(local->hw.wiphy,
- NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
- ieee80211_sta_register_airtime(&sta->sta, tid,
- info->status.tx_time, 0);
-
- if ((tx_time_est = ieee80211_info_get_tx_time_est(info)) > 0) {
- /* Do this here to avoid the expensive lookup of the sta
- * in ieee80211_report_used_skb().
- */
- ieee80211_sta_update_pending_airtime(local, sta,
- skb_get_queue_mapping(skb),
- tx_time_est,
- true);
- ieee80211_info_set_tx_time_est(info, 0);
- }
-
- if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
- if (acked) {
- if (sta->status_stats.lost_packets)
- sta->status_stats.lost_packets = 0;
-
- /* Track when last TDLS packet was ACKed */
- sta->status_stats.last_pkt_time = jiffies;
- } else if (noack_success) {
- /* nothing to do here, do not account as lost */
- } else {
- ieee80211_lost_packet(sta, info);
- }
- }
}
/* SNMP counters
@@ -1101,7 +1032,10 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
* with this test...
*/
if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) {
- dev_kfree_skb(skb);
+ if (status->free_list)
+ list_add_tail(&skb->list, status->free_list);
+ else
+ dev_kfree_skb(skb);
return;
}
@@ -1126,7 +1060,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
if (sta)
status.sta = &sta->sta;
- __ieee80211_tx_status(hw, &status);
+ ieee80211_tx_status_ext(hw, &status);
rcu_read_unlock();
}
EXPORT_SYMBOL(ieee80211_tx_status);
@@ -1137,10 +1071,12 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_tx_info *info = status->info;
struct ieee80211_sta *pubsta = status->sta;
+ struct sk_buff *skb = status->skb;
struct ieee80211_supported_band *sband;
- struct sta_info *sta;
- int retry_count;
+ struct sta_info *sta = NULL;
+ int rates_idx, retry_count;
bool acked, noack_success;
+ u16 tx_time_est;
if (pubsta) {
sta = container_of(pubsta, struct sta_info, sta);
@@ -1149,13 +1085,22 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
sta->tx_stats.last_rate_info = *status->rate;
}
- if (status->skb)
- return __ieee80211_tx_status(hw, status);
+ if (skb && (tx_time_est =
+ ieee80211_info_get_tx_time_est(IEEE80211_SKB_CB(skb))) > 0) {
+ /* Do this here to avoid the expensive lookup of the sta
+ * in ieee80211_report_used_skb().
+ */
+ ieee80211_sta_update_pending_airtime(local, sta,
+ skb_get_queue_mapping(skb),
+ tx_time_est,
+ true);
+ ieee80211_info_set_tx_time_est(IEEE80211_SKB_CB(skb), 0);
+ }
- if (!status->sta)
- return;
+ if (!status->info)
+ goto free;
- ieee80211_tx_get_rates(hw, info, &retry_count);
+ rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
sband = hw->wiphy->bands[info->band];
@@ -1163,24 +1108,52 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED);
if (pubsta) {
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+
if (!acked && !noack_success)
sta->status_stats.retry_failed++;
sta->status_stats.retry_count += retry_count;
- if (acked) {
- sta->status_stats.last_ack = jiffies;
+ if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP))
+ ieee80211_sta_tx_notify(sdata, (void *) skb->data,
+ acked, info->status.tx_time);
- if (sta->status_stats.lost_packets)
- sta->status_stats.lost_packets = 0;
+ if (acked) {
+ sta->status_stats.last_ack = jiffies;
- /* Track when last packet was ACKed */
- sta->status_stats.last_pkt_time = jiffies;
- } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
- return;
- } else if (noack_success) {
- /* nothing to do here, do not account as lost */
- } else {
- ieee80211_lost_packet(sta, info);
+ if (sta->status_stats.lost_packets)
+ sta->status_stats.lost_packets = 0;
+
+ /* Track when last packet was ACKed */
+ sta->status_stats.last_pkt_time = jiffies;
+
+ /* Reset connection monitor */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ unlikely(sdata->u.mgd.probe_send_count > 0))
+ sdata->u.mgd.probe_send_count = 0;
+
+ if (info->status.is_valid_ack_signal) {
+ sta->status_stats.last_ack_signal =
+ (s8)info->status.ack_signal;
+ sta->status_stats.ack_signal_filled = true;
+ ewma_avg_signal_add(&sta->status_stats.avg_ack_signal,
+ -info->status.ack_signal);
+ }
+ } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
+ /*
+ * The STA is in power save mode, so assume
+ * that this TX packet failed because of that.
+ */
+ if (skb)
+ ieee80211_handle_filtered_frame(local, sta, skb);
+ return;
+ } else if (noack_success) {
+ /* nothing to do here, do not account as lost */
+ } else {
+ ieee80211_lost_packet(sta, info);
+ }
}
rate_control_tx_status(local, sband, status);
@@ -1188,6 +1161,10 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
ieee80211s_update_metric(local, sta, status);
}
+ if (skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP))
+ return __ieee80211_tx_status(hw, status, rates_idx,
+ retry_count);
+
if (acked || noack_success) {
I802_DEBUG_INC(local->dot11TransmittedFrameCount);
if (!pubsta)
@@ -1199,6 +1176,16 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
} else {
I802_DEBUG_INC(local->dot11FailedCount);
}
+
+free:
+ if (!skb)
+ return;
+
+ ieee80211_report_used_skb(local, skb, false);
+ if (status->free_list)
+ list_add_tail(&skb->list, status->free_list);
+ else
+ dev_kfree_skb(skb);
}
EXPORT_SYMBOL(ieee80211_tx_status_ext);
@@ -1225,69 +1212,23 @@ void ieee80211_tx_status_8023(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
struct sk_buff *skb)
{
- struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_sub_if_data *sdata;
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_status status = {
+ .skb = skb,
+ .info = IEEE80211_SKB_CB(skb),
+ };
struct sta_info *sta;
- int retry_count;
- int rates_idx;
- bool acked;
sdata = vif_to_sdata(vif);
- acked = info->flags & IEEE80211_TX_STAT_ACK;
- rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
-
rcu_read_lock();
- if (ieee80211_lookup_ra_sta(sdata, skb, &sta))
- goto counters_update;
-
- if (IS_ERR(sta))
- goto counters_update;
-
- if (!acked)
- sta->status_stats.retry_failed++;
-
- if (rates_idx != -1)
- sta->tx_stats.last_rate = info->status.rates[rates_idx];
-
- sta->status_stats.retry_count += retry_count;
-
- if (ieee80211_hw_check(hw, REPORTS_TX_ACK_STATUS)) {
- if (acked && vif->type == NL80211_IFTYPE_STATION)
- ieee80211_sta_reset_conn_monitor(sdata);
-
- sta->status_stats.last_ack = jiffies;
- if (info->flags & IEEE80211_TX_STAT_ACK) {
- if (sta->status_stats.lost_packets)
- sta->status_stats.lost_packets = 0;
+ if (!ieee80211_lookup_ra_sta(sdata, skb, &sta) && !IS_ERR(sta))
+ status.sta = &sta->sta;
- sta->status_stats.last_pkt_time = jiffies;
- } else {
- ieee80211_lost_packet(sta, info);
- }
- }
+ ieee80211_tx_status_ext(hw, &status);
-counters_update:
rcu_read_unlock();
- ieee80211_led_tx(local);
-
- if (!(info->flags & IEEE80211_TX_STAT_ACK) &&
- !(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED))
- goto skip_stats_update;
-
- I802_DEBUG_INC(local->dot11TransmittedFrameCount);
- if (is_multicast_ether_addr(skb->data))
- I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount);
- if (retry_count > 0)
- I802_DEBUG_INC(local->dot11RetryCount);
- if (retry_count > 1)
- I802_DEBUG_INC(local->dot11MultipleRetryCount);
-
-skip_stats_update:
- ieee80211_report_used_skb(local, skb, false);
- dev_kfree_skb(skb);
}
EXPORT_SYMBOL(ieee80211_tx_status_8023);
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 50ab5b9d8eab..601322e16957 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2,7 +2,7 @@
/*
* Portions of this file
* Copyright(c) 2016-2017 Intel Deutschland GmbH
-* Copyright (C) 2018 - 2019 Intel Corporation
+* Copyright (C) 2018 - 2020 Intel Corporation
*/
#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
@@ -2086,6 +2086,27 @@ TRACE_EVENT(api_connection_loss,
)
);
+TRACE_EVENT(api_disconnect,
+ TP_PROTO(struct ieee80211_sub_if_data *sdata, bool reconnect),
+
+ TP_ARGS(sdata, reconnect),
+
+ TP_STRUCT__entry(
+ VIF_ENTRY
+ __field(int, reconnect)
+ ),
+
+ TP_fast_assign(
+ VIF_ASSIGN;
+ __entry->reconnect = reconnect;
+ ),
+
+ TP_printk(
+ VIF_PR_FMT " reconnect:%d",
+ VIF_PR_ARG, __entry->reconnect
+ )
+);
+
TRACE_EVENT(api_cqm_rssi_notify,
TP_PROTO(struct ieee80211_sub_if_data *sdata,
enum nl80211_cqm_rssi_threshold_event rssi_event,
@@ -2734,6 +2755,39 @@ TRACE_EVENT(drv_get_ftm_responder_stats,
)
);
+DEFINE_EVENT(local_sdata_addr_evt, drv_update_vif_offload,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_sta_set_4addr,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, bool enabled),
+
+ TP_ARGS(local, sdata, sta, enabled),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(bool, enabled)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->enabled = enabled;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " enabled:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->enabled
+ )
+);
+
#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index dca01d7e6e3e..ebb3228ce971 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,16 +38,6 @@
/* misc utils */
-static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
-{
- struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- tstats->tx_packets++;
- tstats->tx_bytes += len;
- u64_stats_update_end(&tstats->syncp);
-}
-
static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
struct sk_buff *skb, int group_addr,
int next_frag_len)
@@ -82,6 +72,10 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
erp = txrate->flags & IEEE80211_RATE_ERP_G;
+ /* device is expected to do this */
+ if (sband->band == NL80211_BAND_S1GHZ)
+ return 0;
+
/*
* data and mgmt (except PS Poll):
* - during CFP: 32768
@@ -315,9 +309,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
if (tx->sdata->vif.type == NL80211_IFTYPE_OCB)
return TX_CONTINUE;
- if (tx->sdata->vif.type == NL80211_IFTYPE_WDS)
- return TX_CONTINUE;
-
if (tx->flags & IEEE80211_TX_PS_BUFFERED)
return TX_CONTINUE;
@@ -531,7 +522,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
info->control.jiffies = jiffies;
info->control.vif = &tx->sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
skb_queue_tail(&sta->ps_tx_buf[ac], tx->skb);
spin_unlock(&sta->ps_lock);
@@ -658,7 +649,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
if (!skip_hw && tx->key &&
tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)
info->control.hw_key = &tx->key->conf;
- } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta &&
+ } else if (ieee80211_is_data_present(hdr->frame_control) && tx->sta &&
test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) {
return TX_DROP;
}
@@ -1134,7 +1125,7 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
tx->sta->sta.addr, tx->sta->sta.aid);
}
info->control.vif = &tx->sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
__skb_queue_tail(&tid_tx->pending, skb);
if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER)
@@ -1179,7 +1170,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
* we are doing the needed processing, so remove the flag
* now.
*/
- info->flags &= ~IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags &= ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
hdr = (struct ieee80211_hdr *) skb->data;
@@ -1258,7 +1249,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
(info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
return NULL;
- if (!(info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) &&
+ if (!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) &&
unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
if ((!ieee80211_is_mgmt(hdr->frame_control) ||
ieee80211_is_bufferable_mmpdu(hdr->frame_control) ||
@@ -1938,19 +1929,24 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
/* device xmit handlers */
+enum ieee80211_encrypt {
+ ENCRYPT_NO,
+ ENCRYPT_MGMT,
+ ENCRYPT_DATA,
+};
+
static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb,
- int head_need, bool may_encrypt)
+ int head_need,
+ enum ieee80211_encrypt encrypt)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_hdr *hdr;
bool enc_tailroom;
int tail_need = 0;
- hdr = (struct ieee80211_hdr *) skb->data;
- enc_tailroom = may_encrypt &&
- (sdata->crypto_tx_tailroom_needed_cnt ||
- ieee80211_is_mgmt(hdr->frame_control));
+ enc_tailroom = encrypt == ENCRYPT_MGMT ||
+ (encrypt == ENCRYPT_DATA &&
+ sdata->crypto_tx_tailroom_needed_cnt);
if (enc_tailroom) {
tail_need = IEEE80211_ENCRYPT_TAILROOM;
@@ -1981,23 +1977,29 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
- struct ieee80211_hdr *hdr;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
int headroom;
- bool may_encrypt;
+ enum ieee80211_encrypt encrypt;
- may_encrypt = !(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT);
+ if (info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)
+ encrypt = ENCRYPT_NO;
+ else if (ieee80211_is_mgmt(hdr->frame_control))
+ encrypt = ENCRYPT_MGMT;
+ else
+ encrypt = ENCRYPT_DATA;
headroom = local->tx_headroom;
- if (may_encrypt)
+ if (encrypt != ENCRYPT_NO)
headroom += sdata->encrypt_headroom;
headroom -= skb_headroom(skb);
headroom = max_t(int, 0, headroom);
- if (ieee80211_skb_resize(sdata, skb, headroom, may_encrypt)) {
+ if (ieee80211_skb_resize(sdata, skb, headroom, encrypt)) {
ieee80211_free_txskb(&local->hw, skb);
return;
}
+ /* reload after potential resize */
hdr = (struct ieee80211_hdr *) skb->data;
info->control.vif = &sdata->vif;
@@ -2098,6 +2100,9 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
info->flags |= IEEE80211_TX_CTL_NO_ACK;
if (txflags & IEEE80211_RADIOTAP_F_TX_NOSEQNO)
info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO;
+ if (txflags & IEEE80211_RADIOTAP_F_TX_ORDER)
+ info->control.flags |=
+ IEEE80211_TX_CTRL_DONT_REORDER;
break;
case IEEE80211_RADIOTAP_RATE:
@@ -2264,11 +2269,13 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
payload[7]);
}
- /*
- * Initialize skb->priority for QoS frames. This is put in the TID field
- * of the frame before passing it to the driver.
+ /* Initialize skb->priority for QoS frames. If the DONT_REORDER flag
+ * is set, stick to the default value for skb->priority to assure
+ * frames injected with this flag are not reordered relative to each
+ * other.
*/
- if (ieee80211_is_data_qos(hdr->frame_control)) {
+ if (ieee80211_is_data_qos(hdr->frame_control) &&
+ !(info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER)) {
u8 *p = ieee80211_get_qos_ctl(hdr);
skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK;
}
@@ -2280,8 +2287,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
* we handle as though they are non-injected frames.
* This code here isn't entirely correct, the local MAC address
* isn't always enough to find the interface to use; for proper
- * VLAN/WDS support we will need a different mechanism (which
- * likely isn't going to be monitor interfaces).
+ * VLAN support we have an nl80211-based mechanism.
*
* This is necessary, for example, for old hostapd versions that
* don't use nl80211-based management TX/RX.
@@ -2292,8 +2298,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
if (!ieee80211_sdata_running(tmp_sdata))
continue;
if (tmp_sdata->vif.type == NL80211_IFTYPE_MONITOR ||
- tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- tmp_sdata->vif.type == NL80211_IFTYPE_WDS)
+ tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
continue;
if (ether_addr_equal(tmp_sdata->vif.addr, hdr->addr2)) {
sdata = tmp_sdata;
@@ -2387,9 +2392,6 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
}
sta = sta_info_get_bss(sdata, skb->data);
break;
- case NL80211_IFTYPE_WDS:
- sta = sta_info_get(sdata, sdata->u.wds.remote_addr);
- break;
#ifdef CONFIG_MAC80211_MESH
case NL80211_IFTYPE_MESH_POINT:
/* determined much later */
@@ -2473,7 +2475,9 @@ static u16 ieee80211_store_ack_skb(struct ieee80211_local *local,
* @sdata: virtual interface to build the header for
* @skb: the skb to build the header in
* @info_flags: skb flags to set
+ * @sta: the station pointer
* @ctrl_flags: info control flags to set
+ * @cookie: cookie pointer to fill (if not %NULL)
*
* This function takes the skb with 802.3 header and reformats the header to
* the appropriate IEEE 802.11 header based on which interface the packet is
@@ -2563,20 +2567,6 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
hdrlen = 24;
band = chanctx_conf->def.chan->band;
break;
- case NL80211_IFTYPE_WDS:
- fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
- /* RA TA DA SA */
- memcpy(hdr.addr1, sdata->u.wds.remote_addr, ETH_ALEN);
- memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
- memcpy(hdr.addr3, skb->data, ETH_ALEN);
- memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
- hdrlen = 30;
- /*
- * This is the exception! WDS style interfaces are prohibited
- * when channel contexts are in used so this must be valid
- */
- band = local->hw.conf.chandef.chan->band;
- break;
#ifdef CONFIG_MAC80211_MESH
case NL80211_IFTYPE_MESH_POINT:
if (!is_multicast_ether_addr(skb->data)) {
@@ -2822,7 +2812,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
head_need += sdata->encrypt_headroom;
head_need += local->tx_headroom;
head_need = max_t(int, 0, head_need);
- if (ieee80211_skb_resize(sdata, skb, head_need, true)) {
+ if (ieee80211_skb_resize(sdata, skb, head_need, ENCRYPT_DATA)) {
ieee80211_free_txskb(&local->hw, skb);
skb = NULL;
return ERR_PTR(-ENOMEM);
@@ -3386,7 +3376,7 @@ static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
if (key)
info->control.hw_key = &key->conf;
- ieee80211_tx_stats(skb->dev, skb->len);
+ dev_sw_netstats_tx_add(skb->dev, 1, skb->len);
if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
@@ -3496,7 +3486,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
if (unlikely(ieee80211_skb_resize(sdata, skb,
max_t(int, extra_head + hw_headroom -
skb_headroom(skb), 0),
- false))) {
+ ENCRYPT_NO))) {
kfree_skb(skb);
return true;
}
@@ -3613,13 +3603,14 @@ begin:
tx.skb = skb;
tx.sdata = vif_to_sdata(info->control.vif);
- if (txq->sta && !(info->flags & IEEE80211_TX_CTL_INJECTED)) {
+ if (txq->sta) {
tx.sta = container_of(txq->sta, struct sta_info, sta);
/*
* Drop unicast frames to unauthorised stations unless they are
- * EAPOL frames from the local station.
+ * injected frames or EAPOL frames from the local station.
*/
- if (unlikely(ieee80211_is_data(hdr->frame_control) &&
+ if (unlikely(!(info->flags & IEEE80211_TX_CTL_INJECTED) &&
+ ieee80211_is_data(hdr->frame_control) &&
!ieee80211_vif_is_mesh(&tx.sdata->vif) &&
tx.sdata->vif.type != NL80211_IFTYPE_OCB &&
!is_multicast_ether_addr(hdr->addr1) &&
@@ -3649,7 +3640,7 @@ begin:
else
info->flags &= ~IEEE80211_TX_CTL_AMPDU;
- if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP)
+ if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)
goto encap_out;
if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
@@ -3818,7 +3809,7 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
* get immediately moved to the back of the list on the next
* call to ieee80211_next_txq().
*/
- if (txqi->txq.sta &&
+ if (txqi->txq.sta && local->airtime_flags &&
wiphy_ext_feature_isset(local->hw.wiphy,
NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
list_add(&txqi->schedule_order,
@@ -4003,7 +3994,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
goto out;
}
- ieee80211_tx_stats(dev, skb->len);
+ dev_sw_netstats_tx_add(dev, 1, skb->len);
ieee80211_xmit(sdata, sta, skb);
}
@@ -4190,38 +4181,18 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata,
static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
struct net_device *dev, struct sta_info *sta,
- struct sk_buff *skb)
+ struct ieee80211_key *key, struct sk_buff *skb)
{
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
- struct ethhdr *ehdr = (struct ethhdr *)skb->data;
struct ieee80211_local *local = sdata->local;
- bool authorized = false;
- bool multicast;
- unsigned char *ra = ehdr->h_dest;
-
- if (IS_ERR(sta) || (sta && !sta->uploaded))
- sta = NULL;
-
- if (sdata->vif.type == NL80211_IFTYPE_STATION &&
- (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER)))
- ra = sdata->u.mgd.bssid;
-
- if (is_zero_ether_addr(ra))
- goto out_free;
+ struct tid_ampdu_tx *tid_tx;
+ u8 tid;
- multicast = is_multicast_ether_addr(ra);
-
- if (sta)
- authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED);
-
- if (!multicast && !authorized &&
- (ehdr->h_proto != sdata->control_port_protocol ||
- !ether_addr_equal(sdata->vif.addr, ehdr->h_source)))
- goto out_free;
-
- if (multicast && sdata->vif.type == NL80211_IFTYPE_AP &&
- !atomic_read(&sdata->u.ap.num_mcast_sta))
- goto out_free;
+ if (local->ops->wake_tx_queue) {
+ u16 queue = __ieee80211_select_queue(sdata, sta, skb);
+ skb_set_queue_mapping(skb, queue);
+ skb_get_hash(skb);
+ }
if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) &&
test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))
@@ -4229,36 +4200,42 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
memset(info, 0, sizeof(*info));
- if (unlikely(!multicast && skb->sk &&
- skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS))
- info->ack_frame_id = ieee80211_store_ack_skb(local, skb,
- &info->flags, NULL);
+ tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+ if (tid_tx) {
+ if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) {
+ /* fall back to non-offload slow path */
+ __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL);
+ return;
+ }
- if (unlikely(sdata->control_port_protocol == ehdr->h_proto)) {
- if (sdata->control_port_no_encrypt)
- info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
- info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO;
+ info->flags |= IEEE80211_TX_CTL_AMPDU;
+ if (tid_tx->timeout)
+ tid_tx->last_tx = jiffies;
}
- if (multicast)
- info->flags |= IEEE80211_TX_CTL_NO_ACK;
+ if (unlikely(skb->sk &&
+ skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS))
+ info->ack_frame_id = ieee80211_store_ack_skb(local, skb,
+ &info->flags, NULL);
info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
- ieee80211_tx_stats(dev, skb->len);
+ dev_sw_netstats_tx_add(dev, 1, skb->len);
- if (sta) {
- sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
- sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
- }
+ sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
+ sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
sdata = container_of(sdata->bss,
struct ieee80211_sub_if_data, u.ap);
- info->control.flags |= IEEE80211_TX_CTRL_HW_80211_ENCAP;
+ info->flags |= IEEE80211_TX_CTL_HW_80211_ENCAP;
info->control.vif = &sdata->vif;
+ if (key)
+ info->control.hw_key = &key->conf;
+
ieee80211_tx_8023(sdata, skb, skb->len, sta, false);
return;
@@ -4271,13 +4248,10 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb,
struct net_device *dev)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ethhdr *ehdr = (struct ethhdr *)skb->data;
+ struct ieee80211_key *key;
struct sta_info *sta;
- if (WARN_ON(!sdata->hw_80211_encap)) {
- kfree_skb(skb);
- return NETDEV_TX_OK;
- }
-
if (unlikely(skb->len < ETH_HLEN)) {
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -4285,11 +4259,30 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb,
rcu_read_lock();
- if (ieee80211_lookup_ra_sta(sdata, skb, &sta))
+ if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) {
kfree_skb(skb);
- else
- ieee80211_8023_xmit(sdata, dev, sta, skb);
+ goto out;
+ }
+
+ if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded ||
+ !test_sta_flag(sta, WLAN_STA_AUTHORIZED) ||
+ sdata->control_port_protocol == ehdr->h_proto))
+ goto skip_offload;
+
+ key = rcu_dereference(sta->ptk[sta->ptk_idx]);
+ if (!key)
+ key = rcu_dereference(sdata->default_unicast_key);
+ if (key && (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) ||
+ key->conf.cipher == WLAN_CIPHER_SUITE_TKIP))
+ goto skip_offload;
+
+ ieee80211_8023_xmit(sdata, dev, sta, key, skb);
+ goto out;
+
+skip_offload:
+ ieee80211_subif_start_xmit(skb, dev);
+out:
rcu_read_unlock();
return NETDEV_TX_OK;
@@ -4365,7 +4358,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
sdata = vif_to_sdata(info->control.vif);
- if (info->flags & IEEE80211_TX_INTFL_NEED_TXPROCESSING) {
+ if (info->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING) {
chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
if (unlikely(!chanctx_conf)) {
dev_kfree_skb(skb);
@@ -4373,7 +4366,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
}
info->band = chanctx_conf->def.chan->band;
result = ieee80211_tx(sdata, NULL, skb, true);
- } else if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) {
+ } else if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) {
if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) {
dev_kfree_skb(skb);
return true;
@@ -4401,9 +4394,10 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
/*
* Transmit all pending packets. Called from tasklet.
*/
-void ieee80211_tx_pending(unsigned long data)
+void ieee80211_tx_pending(struct tasklet_struct *t)
{
- struct ieee80211_local *local = (struct ieee80211_local *)data;
+ struct ieee80211_local *local = from_tasklet(local, t,
+ tx_pending_tasklet);
unsigned long flags;
int i;
bool txok;
@@ -4538,14 +4532,14 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
return 0;
}
-static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata,
- struct beacon_data *beacon)
+static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
+ struct beacon_data *beacon)
{
struct probe_resp *resp;
u8 *beacon_data;
size_t beacon_data_len;
int i;
- u8 count = beacon->csa_current_counter;
+ u8 count = beacon->cntdwn_current_counter;
switch (sdata->vif.type) {
case NL80211_IFTYPE_AP:
@@ -4565,36 +4559,36 @@ static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata,
}
rcu_read_lock();
- for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; ++i) {
+ for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; ++i) {
resp = rcu_dereference(sdata->u.ap.probe_resp);
- if (beacon->csa_counter_offsets[i]) {
- if (WARN_ON_ONCE(beacon->csa_counter_offsets[i] >=
+ if (beacon->cntdwn_counter_offsets[i]) {
+ if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[i] >=
beacon_data_len)) {
rcu_read_unlock();
return;
}
- beacon_data[beacon->csa_counter_offsets[i]] = count;
+ beacon_data[beacon->cntdwn_counter_offsets[i]] = count;
}
if (sdata->vif.type == NL80211_IFTYPE_AP && resp)
- resp->data[resp->csa_counter_offsets[i]] = count;
+ resp->data[resp->cntdwn_counter_offsets[i]] = count;
}
rcu_read_unlock();
}
-static u8 __ieee80211_csa_update_counter(struct beacon_data *beacon)
+static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon)
{
- beacon->csa_current_counter--;
+ beacon->cntdwn_current_counter--;
/* the counter should never reach 0 */
- WARN_ON_ONCE(!beacon->csa_current_counter);
+ WARN_ON_ONCE(!beacon->cntdwn_current_counter);
- return beacon->csa_current_counter;
+ return beacon->cntdwn_current_counter;
}
-u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif)
+u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct beacon_data *beacon = NULL;
@@ -4612,15 +4606,15 @@ u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif)
if (!beacon)
goto unlock;
- count = __ieee80211_csa_update_counter(beacon);
+ count = __ieee80211_beacon_update_cntdwn(beacon);
unlock:
rcu_read_unlock();
return count;
}
-EXPORT_SYMBOL(ieee80211_csa_update_counter);
+EXPORT_SYMBOL(ieee80211_beacon_update_cntdwn);
-void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter)
+void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct beacon_data *beacon = NULL;
@@ -4637,15 +4631,15 @@ void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter)
if (!beacon)
goto unlock;
- if (counter < beacon->csa_current_counter)
- beacon->csa_current_counter = counter;
+ if (counter < beacon->cntdwn_current_counter)
+ beacon->cntdwn_current_counter = counter;
unlock:
rcu_read_unlock();
}
-EXPORT_SYMBOL(ieee80211_csa_set_counter);
+EXPORT_SYMBOL(ieee80211_beacon_set_cntdwn);
-bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
+bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct beacon_data *beacon = NULL;
@@ -4688,20 +4682,21 @@ bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
goto out;
}
- if (!beacon->csa_counter_offsets[0])
+ if (!beacon->cntdwn_counter_offsets[0])
goto out;
- if (WARN_ON_ONCE(beacon->csa_counter_offsets[0] > beacon_data_len))
+ if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[0] > beacon_data_len))
goto out;
- if (beacon_data[beacon->csa_counter_offsets[0]] == 1)
+ if (beacon_data[beacon->cntdwn_counter_offsets[0]] == 1)
ret = true;
+
out:
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL(ieee80211_csa_is_complete);
+EXPORT_SYMBOL(ieee80211_beacon_cntdwn_is_complete);
static int ieee80211_beacon_protect(struct sk_buff *skb,
struct ieee80211_local *local,
@@ -4761,11 +4756,11 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
beacon = rcu_dereference(ap->beacon);
if (beacon) {
- if (beacon->csa_counter_offsets[0]) {
+ if (beacon->cntdwn_counter_offsets[0]) {
if (!is_template)
- __ieee80211_csa_update_counter(beacon);
+ ieee80211_beacon_update_cntdwn(vif);
- ieee80211_set_csa(sdata, beacon);
+ ieee80211_set_beacon_cntdwn(sdata, beacon);
}
/*
@@ -4809,11 +4804,11 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
if (!beacon)
goto out;
- if (beacon->csa_counter_offsets[0]) {
+ if (beacon->cntdwn_counter_offsets[0]) {
if (!is_template)
- __ieee80211_csa_update_counter(beacon);
+ __ieee80211_beacon_update_cntdwn(beacon);
- ieee80211_set_csa(sdata, beacon);
+ ieee80211_set_beacon_cntdwn(sdata, beacon);
}
skb = dev_alloc_skb(local->tx_headroom + beacon->head_len +
@@ -4833,16 +4828,16 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
if (!beacon)
goto out;
- if (beacon->csa_counter_offsets[0]) {
+ if (beacon->cntdwn_counter_offsets[0]) {
if (!is_template)
/* TODO: For mesh csa_counter is in TU, so
* decrementing it by one isn't correct, but
* for now we leave it consistent with overall
* mac80211's behavior.
*/
- __ieee80211_csa_update_counter(beacon);
+ __ieee80211_beacon_update_cntdwn(beacon);
- ieee80211_set_csa(sdata, beacon);
+ ieee80211_set_beacon_cntdwn(sdata, beacon);
}
if (ifmsh->sync_ops)
@@ -4874,13 +4869,13 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
if (offs && beacon) {
int i;
- for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; i++) {
- u16 csa_off = beacon->csa_counter_offsets[i];
+ for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) {
+ u16 csa_off = beacon->cntdwn_counter_offsets[i];
if (!csa_off)
continue;
- offs->csa_counter_offs[i] = csa_off_base + csa_off;
+ offs->cntdwn_counter_offs[i] = csa_off_base + csa_off;
}
}
@@ -4999,6 +4994,63 @@ out:
}
EXPORT_SYMBOL(ieee80211_proberesp_get);
+struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif)
+{
+ struct sk_buff *skb = NULL;
+ struct fils_discovery_data *tmpl = NULL;
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ return NULL;
+
+ rcu_read_lock();
+ tmpl = rcu_dereference(sdata->u.ap.fils_discovery);
+ if (!tmpl) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len);
+ if (skb) {
+ skb_reserve(skb, sdata->local->hw.extra_tx_headroom);
+ skb_put_data(skb, tmpl->data, tmpl->len);
+ }
+
+ rcu_read_unlock();
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_get_fils_discovery_tmpl);
+
+struct sk_buff *
+ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif)
+{
+ struct sk_buff *skb = NULL;
+ struct unsol_bcast_probe_resp_data *tmpl = NULL;
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ return NULL;
+
+ rcu_read_lock();
+ tmpl = rcu_dereference(sdata->u.ap.unsol_bcast_probe_resp);
+ if (!tmpl) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len);
+ if (skb) {
+ skb_reserve(skb, sdata->local->hw.extra_tx_headroom);
+ skb_put_data(skb, tmpl->data, tmpl->len);
+ }
+
+ rcu_read_unlock();
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_get_unsol_bcast_probe_resp_tmpl);
+
struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
struct ieee80211_vif *vif)
{
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 8d3bfc0fe176..8d3ae6b2f95f 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -45,6 +45,58 @@ struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy)
}
EXPORT_SYMBOL(wiphy_to_ieee80211_hw);
+u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
+ enum nl80211_iftype type)
+{
+ __le16 fc = hdr->frame_control;
+
+ if (ieee80211_is_data(fc)) {
+ if (len < 24) /* drop incorrect hdr len (data) */
+ return NULL;
+
+ if (ieee80211_has_a4(fc))
+ return NULL;
+ if (ieee80211_has_tods(fc))
+ return hdr->addr1;
+ if (ieee80211_has_fromds(fc))
+ return hdr->addr2;
+
+ return hdr->addr3;
+ }
+
+ if (ieee80211_is_s1g_beacon(fc)) {
+ struct ieee80211_ext *ext = (void *) hdr;
+
+ return ext->u.s1g_beacon.sa;
+ }
+
+ if (ieee80211_is_mgmt(fc)) {
+ if (len < 24) /* drop incorrect hdr len (mgmt) */
+ return NULL;
+ return hdr->addr3;
+ }
+
+ if (ieee80211_is_ctl(fc)) {
+ if (ieee80211_is_pspoll(fc))
+ return hdr->addr1;
+
+ if (ieee80211_is_back_req(fc)) {
+ switch (type) {
+ case NL80211_IFTYPE_STATION:
+ return hdr->addr2;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ return hdr->addr1;
+ default:
+ break; /* fall through to the return */
+ }
+ }
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(ieee80211_get_bssid);
+
void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx)
{
struct sk_buff *skb;
@@ -334,9 +386,10 @@ _ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags)
rcu_read_unlock();
}
-void ieee80211_wake_txqs(unsigned long data)
+void ieee80211_wake_txqs(struct tasklet_struct *t)
{
- struct ieee80211_local *local = (struct ieee80211_local *)data;
+ struct ieee80211_local *local = from_tasklet(local, t,
+ wake_txqs_tasklet);
unsigned long flags;
spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
@@ -733,6 +786,9 @@ static void __iterate_interfaces(struct ieee80211_local *local,
if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) &&
active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER))
continue;
+ if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) &&
+ !(sdata->flags & IEEE80211_SDATA_IN_DRIVER))
+ continue;
if (ieee80211_sdata_running(sdata) || !active_only)
iterator(data, sdata->vif.addr,
&sdata->vif);
@@ -1003,6 +1059,11 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
case WLAN_EID_LINK_ID:
case WLAN_EID_BSS_MAX_IDLE_PERIOD:
case WLAN_EID_RSNX:
+ case WLAN_EID_S1G_BCN_COMPAT:
+ case WLAN_EID_S1G_CAPABILITIES:
+ case WLAN_EID_S1G_OPERATION:
+ case WLAN_EID_AID_RESPONSE:
+ case WLAN_EID_S1G_SHORT_BCN_INTERVAL:
/*
* not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible
* that if the content gets bigger it might be needed more than once
@@ -1288,6 +1349,30 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
&crc : NULL,
elem, elems);
break;
+ case WLAN_EID_S1G_CAPABILITIES:
+ if (elen == sizeof(*elems->s1g_capab))
+ elems->s1g_capab = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_S1G_OPERATION:
+ if (elen == sizeof(*elems->s1g_oper))
+ elems->s1g_oper = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_S1G_BCN_COMPAT:
+ if (elen == sizeof(*elems->s1g_bcn_compat))
+ elems->s1g_bcn_compat = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_AID_RESPONSE:
+ if (elen == sizeof(struct ieee80211_aid_response_ie))
+ elems->aid_resp = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
default:
break;
}
@@ -2429,7 +2514,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
return res;
}
break;
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_AP_VLAN:
case NL80211_IFTYPE_MONITOR:
case NL80211_IFTYPE_P2P_DEVICE:
@@ -2439,6 +2523,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
case NUM_NL80211_IFTYPES:
case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_WDS:
WARN_ON(1);
break;
}
@@ -3374,6 +3459,42 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
return true;
}
+bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
+ struct cfg80211_chan_def *chandef)
+{
+ u32 oper_freq;
+
+ if (!oper)
+ return false;
+
+ switch (FIELD_GET(S1G_OPER_CH_WIDTH_OPER, oper->ch_width)) {
+ case IEEE80211_S1G_CHANWIDTH_1MHZ:
+ chandef->width = NL80211_CHAN_WIDTH_1;
+ break;
+ case IEEE80211_S1G_CHANWIDTH_2MHZ:
+ chandef->width = NL80211_CHAN_WIDTH_2;
+ break;
+ case IEEE80211_S1G_CHANWIDTH_4MHZ:
+ chandef->width = NL80211_CHAN_WIDTH_4;
+ break;
+ case IEEE80211_S1G_CHANWIDTH_8MHZ:
+ chandef->width = NL80211_CHAN_WIDTH_8;
+ break;
+ case IEEE80211_S1G_CHANWIDTH_16MHZ:
+ chandef->width = NL80211_CHAN_WIDTH_16;
+ break;
+ default:
+ return false;
+ }
+
+ oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch,
+ NL80211_BAND_S1GHZ);
+ chandef->center_freq1 = KHZ_TO_MHZ(oper_freq);
+ chandef->freq1_offset = oper_freq % 1000;
+
+ return true;
+}
+
int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
const struct ieee80211_supported_band *sband,
const u8 *srates, int srates_len, u32 *rates)
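
ieee80211_chandef_s1g_oper() above keeps sub-MHz precision by splitting the kHz operating frequency into a whole-MHz center_freq1 plus a freq1_offset remainder. A small sketch of that split (the example frequency is illustrative only, not derived from any specific S1G channel):

    #include <stdio.h>

    int main(void)
    {
            unsigned int oper_freq_khz = 902500;              /* hypothetical S1G channel */
            unsigned int center_mhz = oper_freq_khz / 1000;   /* KHZ_TO_MHZ()  */
            unsigned int offset_khz = oper_freq_khz % 1000;   /* freq1_offset  */

            printf("center_freq1=%u MHz freq1_offset=%u kHz\n",
                   center_mhz, offset_khz);
            return 0;
    }
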
@@ -3545,6 +3666,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
u64 ts = status->mactime;
struct rate_info ri;
u16 rate;
+ u8 n_ltf;
if (WARN_ON(!ieee80211_have_rx_timestamp(status)))
return 0;
@@ -3555,11 +3677,58 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
/* Fill cfg80211 rate info */
switch (status->encoding) {
+ case RX_ENC_HE:
+ ri.flags |= RATE_INFO_FLAGS_HE_MCS;
+ ri.mcs = status->rate_idx;
+ ri.nss = status->nss;
+ ri.he_ru_alloc = status->he_ru;
+ if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
+ ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
+
+ /*
+ * See P802.11ax_D6.0, section 27.3.4 for
+ * HE PPDU format.
+ */
+ if (status->flag & RX_FLAG_MACTIME_PLCP_START) {
+ mpdu_offset += 2;
+ ts += 36;
+
+ /*
+ * TODO:
+ * For HE MU PPDU, add the HE-SIG-B.
+ * For HE ER PPDU, add 8us for the HE-SIG-A.
+ * For HE TB PPDU, add 4us for the HE-STF.
+ * Add the HE-LTF durations - variable.
+ */
+ }
+
+ break;
case RX_ENC_HT:
ri.mcs = status->rate_idx;
ri.flags |= RATE_INFO_FLAGS_MCS;
if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
+
+ /*
+ * See P802.11REVmd_D3.0, section 19.3.2 for
+ * HT PPDU format.
+ */
+ if (status->flag & RX_FLAG_MACTIME_PLCP_START) {
+ mpdu_offset += 2;
+ if (status->enc_flags & RX_ENC_FLAG_HT_GF)
+ ts += 24;
+ else
+ ts += 32;
+
+ /*
+ * Add Data HT-LTFs per streams
+ * TODO: add Extension HT-LTFs, 4us per LTF
+ */
+ n_ltf = ((ri.mcs >> 3) & 3) + 1;
+ n_ltf = n_ltf == 3 ? 4 : n_ltf;
+ ts += n_ltf * 4;
+ }
+
break;
case RX_ENC_VHT:
ri.flags |= RATE_INFO_FLAGS_VHT_MCS;
@@ -3567,6 +3736,23 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
ri.nss = status->nss;
if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
+
+ /*
+ * See P802.11REVmd_D3.0, section 21.3.2 for
+ * VHT PPDU format.
+ */
+ if (status->flag & RX_FLAG_MACTIME_PLCP_START) {
+ mpdu_offset += 2;
+ ts += 36;
+
+ /*
+ * Add VHT-LTFs per streams
+ */
+ n_ltf = (ri.nss != 1) && (ri.nss % 2) ?
+ ri.nss + 1 : ri.nss;
+ ts += 4 * n_ltf;
+ }
+
break;
default:
WARN_ON(1);
@@ -3590,7 +3776,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift));
if (status->flag & RX_FLAG_MACTIME_PLCP_START) {
- /* TODO: handle HT/VHT preambles */
if (status->band == NL80211_BAND_5GHZ) {
ts += 20 << shift;
mpdu_offset += 2;
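
The HT and VHT branches above move the RX timestamp from PLCP start to the first MPDU byte by adding the preamble duration plus 4 us per long training field: the HT LTF count comes from the MCS-derived stream count (with three streams still using four LTFs), the VHT count from rounding an odd NSS up to the next even value. A user-space sketch of the same arithmetic:

    #include <stdio.h>

    static unsigned int ht_ltf(unsigned int mcs)
    {
            unsigned int n = ((mcs >> 3) & 3) + 1;  /* spatial streams from MCS */

            return n == 3 ? 4 : n;                  /* 3 streams still use 4 LTFs */
    }

    static unsigned int vht_ltf(unsigned int nss)
    {
            /* odd NSS (other than 1) is rounded up to the next even LTF count */
            return (nss != 1 && (nss & 1)) ? nss + 1 : nss;
    }

    int main(void)
    {
            /* HT MCS 12 -> 2 streams -> 32us mixed-format preamble + 2 * 4us LTFs */
            printf("HT  MCS12: ts += %u us\n", 32 + 4 * ht_ltf(12));
            /* VHT NSS 3 -> 4 VHT-LTFs -> 36us preamble + 16us */
            printf("VHT NSS3 : ts += %u us\n", 36 + 4 * vht_ltf(3));
            return 0;
    }
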
@@ -4277,6 +4462,58 @@ int ieee80211_max_num_channels(struct ieee80211_local *local)
return max_num_different_channels;
}
+void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta_s1g_cap *caps,
+ struct sk_buff *skb)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_s1g_cap s1g_capab;
+ u8 *pos;
+ int i;
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
+ return;
+
+ if (!caps->s1g)
+ return;
+
+ memcpy(s1g_capab.capab_info, caps->cap, sizeof(caps->cap));
+ memcpy(s1g_capab.supp_mcs_nss, caps->nss_mcs, sizeof(caps->nss_mcs));
+
+ /* override the capability info */
+ for (i = 0; i < sizeof(ifmgd->s1g_capa.capab_info); i++) {
+ u8 mask = ifmgd->s1g_capa_mask.capab_info[i];
+
+ s1g_capab.capab_info[i] &= ~mask;
+ s1g_capab.capab_info[i] |= ifmgd->s1g_capa.capab_info[i] & mask;
+ }
+
+ /* then MCS and NSS set */
+ for (i = 0; i < sizeof(ifmgd->s1g_capa.supp_mcs_nss); i++) {
+ u8 mask = ifmgd->s1g_capa_mask.supp_mcs_nss[i];
+
+ s1g_capab.supp_mcs_nss[i] &= ~mask;
+ s1g_capab.supp_mcs_nss[i] |=
+ ifmgd->s1g_capa.supp_mcs_nss[i] & mask;
+ }
+
+ pos = skb_put(skb, 2 + sizeof(s1g_capab));
+ *pos++ = WLAN_EID_S1G_CAPABILITIES;
+ *pos++ = sizeof(s1g_capab);
+
+ memcpy(pos, &s1g_capab, sizeof(s1g_capab));
+}
+
+void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ u8 *pos = skb_put(skb, 3);
+
+ *pos++ = WLAN_EID_AID_REQUEST;
+ *pos++ = 1;
+ *pos++ = 0;
+}
+
u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo)
{
*buf++ = WLAN_EID_VENDOR_SPECIFIC;
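
ieee80211_add_s1g_capab_ie() merges the driver-reported S1G capabilities with the managed-mode overrides using a per-byte mask: only the bits set in the mask are replaced, everything else keeps the hardware value. A small sketch of that masking step (the byte values are chosen purely for illustration):

    #include <stdio.h>
    #include <stdint.h>

    /* (orig & ~mask) | (override & mask): replace only the masked bits */
    static uint8_t apply_override(uint8_t orig, uint8_t override, uint8_t mask)
    {
            orig &= ~mask;
            orig |= override & mask;
            return orig;
    }

    int main(void)
    {
            uint8_t hw = 0xa5, user = 0x0f, mask = 0x03;

            /* only the two low bits come from the user override */
            printf("0x%02x\n", apply_override(hw, user, mask));  /* prints 0xa7 */
            return 0;
    }
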
@@ -4319,3 +4556,24 @@ const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = {
IEEE80211_WMM_IE_STA_QOSINFO_AC_BE,
IEEE80211_WMM_IE_STA_QOSINFO_AC_BK
};
+
+u16 ieee80211_encode_usf(int listen_interval)
+{
+ static const int listen_int_usf[] = { 1, 10, 1000, 10000 };
+ u16 ui, usf = 0;
+
+ /* find greatest USF */
+ while (usf < IEEE80211_MAX_USF) {
+ if (listen_interval % listen_int_usf[usf + 1])
+ break;
+ usf += 1;
+ }
+ ui = listen_interval / listen_int_usf[usf];
+
+ /* error if there is a remainder. Should've been checked by user */
+ WARN_ON_ONCE(ui > IEEE80211_MAX_UI);
+ listen_interval = FIELD_PREP(LISTEN_INT_USF, usf) |
+ FIELD_PREP(LISTEN_INT_UI, ui);
+
+ return (u16) listen_interval;
+}
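
ieee80211_encode_usf() above picks the largest unit scaling factor that divides the listen interval exactly, then packs the scaled unit interval next to the USF index. For example, a listen interval of 3000 divides by 10 and by 1000 but not by 10000, so it encodes as USF index 2 (scale 1000) with UI 3. A standalone sketch of the search; the 2-bit USF / 14-bit UI packing shown here is an assumption about the field layout, not taken from this hunk:

    #include <stdio.h>

    int main(void)
    {
            static const int scale[] = { 1, 10, 1000, 10000 };
            int listen_interval = 3000;
            unsigned int usf = 0, ui;

            /* greatest scaling factor that still divides the interval exactly */
            while (usf < 3 && !(listen_interval % scale[usf + 1]))
                    usf++;
            ui = listen_interval / scale[usf];

            /* assumed packing: USF in the top 2 bits, UI in the low 14 bits */
            printf("usf=%u ui=%u encoded=0x%04x\n", usf, ui, (usf << 14) | ui);
            return 0;
    }
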
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index d1b64d0751f2..c3ca97373774 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -315,10 +315,6 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta);
- /* If HT IE reported 3839 bytes only, stay with that size. */
- if (sta->sta.max_amsdu_len == IEEE80211_MAX_MPDU_LEN_HT_3839)
- return;
-
switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) {
case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454;
@@ -469,12 +465,18 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta)
* IEEE80211-2016 specification makes higher bandwidth operation
* possible on the TDLS link if the peers have wider bandwidth
* capability.
+ *
+ * However, in this case, and only if the TDLS peer is authorized,
+ * limit to the tdls_chandef so that the configuration here isn't
+ * wider than what's actually requested on the channel context.
*/
if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
- test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW))
- return bw;
-
- bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width));
+ test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) &&
+ test_sta_flag(sta, WLAN_STA_AUTHORIZED) &&
+ sta->tdls_chandef.chan)
+ bw = min(bw, ieee80211_chan_width_to_rx_bw(sta->tdls_chandef.width));
+ else
+ bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width));
return bw;
}
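
With the vht.c change above, an authorized wider-bandwidth TDLS peer is clamped to the negotiated tdls_chandef width instead of being granted its full capability, so the station configuration can never exceed what was actually requested on the channel context. A minimal sketch of the clamping decision (the enum values and helper names are illustrative):

    #include <stdio.h>

    enum rx_bw { BW_20, BW_40, BW_80, BW_160 };

    static enum rx_bw min_bw(enum rx_bw a, enum rx_bw b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            enum rx_bw sta_cap = BW_160;    /* what the peer could do       */
            enum rx_bw bss_width = BW_40;   /* AP's operating width         */
            enum rx_bw tdls_width = BW_80;  /* negotiated TDLS chandef      */
            int tdls_peer = 1, wider_bw = 1, authorized = 1;

            enum rx_bw bw = (tdls_peer && wider_bw && authorized)
                            ? min_bw(sta_cap, tdls_width)  /* clamp to TDLS chandef */
                            : min_bw(sta_cap, bss_width);  /* clamp to BSS width    */

            printf("rx bandwidth index: %d\n", bw);  /* 2 == BW_80 */
            return 0;
    }
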
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 2fb99325135a..9ea6004abe1b 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -118,9 +118,11 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata,
struct ieee80211_hdr *hdr)
{
struct ieee80211_local *local = sdata->local;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
u8 *p;
- if (local->hw.queues < IEEE80211_NUM_ACS)
+ if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) ||
+ local->hw.queues < IEEE80211_NUM_ACS)
return 0;
if (!ieee80211_is_data(hdr->frame_control)) {
@@ -141,6 +143,7 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata,
u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, struct sk_buff *skb)
{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
struct mac80211_qos_map *qos_map;
bool qos;
@@ -153,7 +156,7 @@ u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
else
qos = false;
- if (!qos) {
+ if (!qos || (info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER)) {
skb->priority = 0; /* required for correct WPA/11i MIC */
return IEEE80211_AC_BE;
}
@@ -202,9 +205,6 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_AP:
ra = skb->data;
break;
- case NL80211_IFTYPE_WDS:
- ra = sdata->u.wds.remote_addr;
- break;
case NL80211_IFTYPE_STATION:
/* might be a TDLS station */
sta = sta_info_get(sdata, skb->data);
@@ -249,6 +249,14 @@ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata,
p = ieee80211_get_qos_ctl(hdr);
+ /* don't overwrite the QoS field of injected frames */
+ if (info->flags & IEEE80211_TX_CTL_INJECTED) {
+ /* do take into account Ack policy of injected frames */
+ if (*p & IEEE80211_QOS_CTL_ACK_POLICY_NOACK)
+ info->flags |= IEEE80211_TX_CTL_NO_ACK;
+ return;
+ }
+
/* set up the first byte */
/*
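
The wme.c changes above have two effects: frames carrying IEEE80211_TX_CTRL_DONT_REORDER are pinned to a single queue (AC_BE / queue 0) so AC scheduling cannot reorder them, and injected frames keep the QoS control field supplied by the injector, with a NOACK ack policy translated into IEEE80211_TX_CTL_NO_ACK. A compact sketch of the selection rule (flag bit value and structure simplified; only the names from the patch are real):

    #include <stdio.h>

    #define TX_CTRL_DONT_REORDER    0x1     /* illustrative bit value */

    static unsigned int select_ac(int is_qos_data, unsigned int tid,
                                  unsigned int ctrl_flags)
    {
            /* standard 802.1d TID -> AC mapping (0=VO, 1=VI, 2=BE, 3=BK) */
            static const unsigned int tid_to_ac[8] = { 2, 3, 3, 2, 1, 1, 0, 0 };

            /* non-QoS traffic and "don't reorder" frames always use one AC */
            if (!is_qos_data || (ctrl_flags & TX_CTRL_DONT_REORDER))
                    return 2;       /* AC_BE */

            return tid_to_ac[tid & 7];
    }

    int main(void)
    {
            printf("voice TID 6 -> AC %u\n", select_ac(1, 6, 0));
            printf("same frame, DONT_REORDER -> AC %u\n",
                   select_ac(1, 6, TX_CTRL_DONT_REORDER));
            return 0;
    }
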
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index 06ea0f8bfd5c..520cedc594e1 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -20,9 +20,9 @@
#include "ieee802154_i.h"
#include "cfg.h"
-static void ieee802154_tasklet_handler(unsigned long data)
+static void ieee802154_tasklet_handler(struct tasklet_struct *t)
{
- struct ieee802154_local *local = (struct ieee802154_local *)data;
+ struct ieee802154_local *local = from_tasklet(local, t, tasklet);
struct sk_buff *skb;
while ((skb = skb_dequeue(&local->skb_queue))) {
@@ -91,9 +91,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
INIT_LIST_HEAD(&local->interfaces);
mutex_init(&local->iflist_mtx);
- tasklet_init(&local->tasklet,
- ieee802154_tasklet_handler,
- (unsigned long)local);
+ tasklet_setup(&local->tasklet, ieee802154_tasklet_handler);
skb_queue_head_init(&local->skb_queue);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index f2868a8a50c3..47bab701555f 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -377,6 +377,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
if (!pskb_may_pull(skb, sizeof(*hdr)))
goto err;
+ skb_dst_drop(skb);
+
/* Read and decode the label */
hdr = mpls_hdr(skb);
dec = mpls_entry_decode(hdr);
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 2def85718d94..ef59e25dc482 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -300,5 +300,6 @@ static void __exit mpls_iptunnel_exit(void)
module_exit(mpls_iptunnel_exit);
MODULE_ALIAS_RTNL_LWT(MPLS);
+MODULE_SOFTDEP("post: mpls_gso");
MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
MODULE_LICENSE("GPL v2");
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
index 698bc3525160..a014149aa323 100644
--- a/net/mptcp/Kconfig
+++ b/net/mptcp/Kconfig
@@ -19,14 +19,11 @@ config INET_MPTCP_DIAG
config MPTCP_IPV6
bool "MPTCP: IPv6 support for Multipath TCP"
- select IPV6
+ depends on IPV6=y
default y
-endif
-
config MPTCP_KUNIT_TESTS
tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS
- select MPTCP
depends on KUNIT
default KUNIT_ALL_TESTS
help
@@ -39,3 +36,4 @@ config MPTCP_KUNIT_TESTS
If unsure, say N.
+endif
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index 05d398d3fde4..b472dc149856 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -21,7 +21,7 @@
*/
#include <linux/kernel.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <asm/unaligned.h>
#include "protocol.h"
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 54b888f94009..96ba616f59bf 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -18,6 +18,7 @@ struct mptcp_pernet {
struct ctl_table_header *ctl_table_hdr;
int mptcp_enabled;
+ unsigned int add_addr_timeout;
};
static struct mptcp_pernet *mptcp_get_pernet(struct net *net)
@@ -30,6 +31,11 @@ int mptcp_is_enabled(struct net *net)
return mptcp_get_pernet(net)->mptcp_enabled;
}
+unsigned int mptcp_get_add_addr_timeout(struct net *net)
+{
+ return mptcp_get_pernet(net)->add_addr_timeout;
+}
+
static struct ctl_table mptcp_sysctl_table[] = {
{
.procname = "enabled",
@@ -40,12 +46,19 @@ static struct ctl_table mptcp_sysctl_table[] = {
*/
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "add_addr_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
{}
};
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
+ pernet->add_addr_timeout = TCP_RTO_MAX;
}
static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
@@ -61,6 +74,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
}
table[0].data = &pernet->mptcp_enabled;
+ table[1].data = &pernet->add_addr_timeout;
hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
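
The new per-namespace add_addr_timeout knob defaults to TCP_RTO_MAX and is wired through proc_dointvec_jiffies, so userspace reads and writes it in seconds while the path manager consumes jiffies directly via mptcp_get_add_addr_timeout(). A tiny sketch of the seconds-to-jiffies round trip the handler performs (the HZ value is illustrative, it depends on the kernel configuration):

    #include <stdio.h>

    #define HZ 250  /* illustrative; the real value is a kernel config choice */

    int main(void)
    {
            unsigned int seconds = 120;              /* value written via sysctl */
            unsigned int jiffies_val = seconds * HZ; /* stored internally        */

            printf("add_addr_timeout: %us -> %u jiffies\n", seconds, jiffies_val);
            return 0;
    }
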
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 0a6a15f3456d..b921cbdd9aaa 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -22,6 +22,15 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+ SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL),
+ SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE),
+ SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE),
+ SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW),
+ SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA),
+ SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR),
+ SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD),
+ SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR),
+ SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
SNMP_MIB_SENTINEL
};
@@ -58,6 +67,7 @@ void mptcp_seq_show(struct seq_file *seq)
for (i = 0; mptcp_snmp_list[i].name; i++)
seq_puts(seq, " 0");
+ seq_putc(seq, '\n');
return;
}
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index d7de340fc997..47bcecce1106 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -15,6 +15,15 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
+ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */
+ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */
+ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */
+ MPTCP_MIB_NODSSWINDOW, /* Segments not in MPTCP windows */
+ MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */
+ MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */
+ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */
+ MPTCP_MIB_RMADDR, /* Received RM_ADDR */
+ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 5f390a97f556..b70ae4ba3000 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -140,7 +140,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
info->mptcpi_flags = flags;
info->mptcpi_token = READ_ONCE(msk->token);
info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
- info->mptcpi_snd_una = atomic64_read(&msk->snd_una);
+ info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
unlock_sock_fast(sk, slow);
}
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 888bbbbb3e8a..e0d21c0607e5 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -7,10 +7,11 @@
#define pr_fmt(fmt) "MPTCP: " fmt
#include <linux/kernel.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
+#include "mib.h"
static bool mptcp_cap_flag_sha256(u8 flags)
{
@@ -240,9 +241,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
}
mp_opt->add_addr = 1;
- mp_opt->port = 0;
mp_opt->addr_id = *ptr++;
- pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id);
if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
ptr += 4;
@@ -267,6 +266,9 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->ahmac = get_unaligned_be64(ptr);
ptr += 8;
}
+ pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
+ (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "",
+ mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port);
break;
case MPTCPOPT_RM_ADDR:
@@ -280,6 +282,16 @@ static void mptcp_parse_option(const struct sk_buff *skb,
pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
break;
+ case MPTCPOPT_MP_FASTCLOSE:
+ if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
+ break;
+
+ ptr += 2;
+ mp_opt->rcvr_key = get_unaligned_be64(ptr);
+ ptr += 8;
+ mp_opt->fastclose = 1;
+ break;
+
default:
break;
}
@@ -296,6 +308,9 @@ void mptcp_get_options(const struct sk_buff *skb,
mp_opt->mp_capable = 0;
mp_opt->mp_join = 0;
mp_opt->add_addr = 0;
+ mp_opt->ahmac = 0;
+ mp_opt->fastclose = 0;
+ mp_opt->port = 0;
mp_opt->rm_addr = 0;
mp_opt->dss = 0;
@@ -490,7 +505,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
bool ret = false;
mpext = skb ? mptcp_get_ext(skb) : NULL;
- snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable);
+ snd_data_fin_enable = mptcp_data_fin_enabled(msk);
if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
unsigned int map_size;
@@ -516,7 +531,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
return ret;
}
- if (subflow->use_64bit_ack) {
+ if (READ_ONCE(msk->use_64bit_ack)) {
ack_size = TCPOLEN_MPTCP_DSS_ACK64;
opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq);
opts->ext_copy.ack64 = 1;
@@ -526,6 +541,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
opts->ext_copy.ack64 = 0;
}
opts->ext_copy.use_ack = 1;
+ WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
@@ -571,45 +587,94 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
}
#endif
-static bool mptcp_established_options_addr(struct sock *sk,
- unsigned int *size,
- unsigned int remaining,
- struct mptcp_out_options *opts)
+static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ bool drop_other_suboptions = false;
+ unsigned int opt_size = *size;
struct mptcp_addr_info saddr;
+ bool echo;
+ bool port;
int len;
- if (!mptcp_pm_should_signal(msk) ||
- !(mptcp_pm_addr_signal(msk, remaining, &saddr)))
+ if ((mptcp_pm_should_add_signal_ipv6(msk) ||
+ mptcp_pm_should_add_signal_port(msk)) &&
+ skb && skb_is_tcp_pure_ack(skb)) {
+ pr_debug("drop other suboptions");
+ opts->suboptions = 0;
+ opts->ext_copy.use_ack = 0;
+ opts->ext_copy.use_map = 0;
+ remaining += opt_size;
+ drop_other_suboptions = true;
+ }
+
+ if (!mptcp_pm_should_add_signal(msk) ||
+ !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port)))
return false;
- len = mptcp_add_addr_len(saddr.family);
+ len = mptcp_add_addr_len(saddr.family, echo, port);
if (remaining < len)
return false;
*size = len;
+ if (drop_other_suboptions)
+ *size -= opt_size;
opts->addr_id = saddr.id;
+ if (port)
+ opts->port = ntohs(saddr.port);
if (saddr.family == AF_INET) {
opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
opts->addr = saddr.addr;
- opts->ahmac = add_addr_generate_hmac(msk->local_key,
- msk->remote_key,
- opts->addr_id,
- &opts->addr);
+ if (!echo) {
+ opts->ahmac = add_addr_generate_hmac(msk->local_key,
+ msk->remote_key,
+ opts->addr_id,
+ &opts->addr);
+ }
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (saddr.family == AF_INET6) {
opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
opts->addr6 = saddr.addr6;
- opts->ahmac = add_addr6_generate_hmac(msk->local_key,
- msk->remote_key,
- opts->addr_id,
- &opts->addr6);
+ if (!echo) {
+ opts->ahmac = add_addr6_generate_hmac(msk->local_key,
+ msk->remote_key,
+ opts->addr_id,
+ &opts->addr6);
+ }
}
#endif
- pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac);
+ pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d",
+ opts->addr_id, opts->ahmac, echo, opts->port);
+
+ return true;
+}
+
+static bool mptcp_established_options_rm_addr(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ u8 rm_id;
+
+ if (!mptcp_pm_should_rm_signal(msk) ||
+ !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id)))
+ return false;
+
+ if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
+ return false;
+
+ *size = TCPOLEN_MPTCP_RM_ADDR_BASE;
+ opts->suboptions |= OPTION_MPTCP_RM_ADDR;
+ opts->rm_id = rm_id;
+
+ pr_debug("rm_id=%d", opts->rm_id);
return true;
}
@@ -626,6 +691,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
if (unlikely(mptcp_check_fallback(sk)))
return false;
+ /* prevent adding any MPTCP-related options on reset packets

+ * until we support MP_TCPRST/MP_FASTCLOSE
+ */
+ if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
+ return false;
+
if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -640,7 +711,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
*size += opt_size;
remaining -= opt_size;
- if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) {
+ if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) {
*size += opt_size;
remaining -= opt_size;
ret = true;
@@ -676,7 +751,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
return false;
}
-static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
+static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
struct mptcp_subflow_context *subflow,
struct sk_buff *skb,
struct mptcp_options_received *mp_opt)
@@ -693,15 +768,20 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
subflow->mp_join && mp_opt->mp_join &&
READ_ONCE(msk->pm.server_side))
- tcp_send_ack(sk);
+ tcp_send_ack(ssk);
goto fully_established;
}
- /* we should process OoO packets before the first subflow is fully
- * established, but not expected for MP_JOIN subflows
+ /* we must process OoO packets before the first subflow is fully
+ * established. OoO packets are instead a protocol violation
+ * for MP_JOIN subflows as the peer must not send any data
+ * before receiving the fourth ack - cfr. RFC 8684 section 3.2.
*/
- if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)
+ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
+ if (subflow->mp_join)
+ goto reset;
return subflow->mp_capable;
+ }
if (mp_opt->dss && mp_opt->use_ack) {
/* subflows are fully established as soon as we get any
@@ -712,10 +792,18 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
goto fully_established;
}
+ if (mp_opt->add_addr) {
+ WRITE_ONCE(msk->fully_established, true);
+ return true;
+ }
+
/* If the first established packet does not contain MP_CAPABLE + data
- * then fallback to TCP
+ * then fallback to TCP. Fallback scenarios require a reset for
+ * MP_JOIN subflows.
*/
if (!mp_opt->mp_capable) {
+ if (subflow->mp_join)
+ goto reset;
subflow->mp_capable = 0;
pr_fallback(msk);
__mptcp_do_fallback(msk);
@@ -727,17 +815,26 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
mptcp_subflow_fully_established(subflow, mp_opt);
fully_established:
- if (likely(subflow->pm_notified))
+ /* if the subflow is not already linked into the conn_list, we can't
+ * notify the PM: this subflow is still on the listener queue
+ * and the PM possibly acquiring the subflow lock could race with
+ * the listener close
+ */
+ if (likely(subflow->pm_notified) || list_empty(&subflow->node))
return true;
subflow->pm_notified = 1;
if (subflow->mp_join) {
- clear_3rdack_retransmission(sk);
+ clear_3rdack_retransmission(ssk);
mptcp_pm_subflow_established(msk, subflow);
} else {
mptcp_pm_fully_established(msk);
}
return true;
+
+reset:
+ mptcp_subflow_reset(ssk);
+ return false;
}
static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
@@ -755,31 +852,42 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
return cur_ack;
}
-static void update_una(struct mptcp_sock *msk,
- struct mptcp_options_received *mp_opt)
+static void ack_update_msk(struct mptcp_sock *msk,
+ struct sock *ssk,
+ struct mptcp_options_received *mp_opt)
{
- u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una);
- u64 write_seq = READ_ONCE(msk->write_seq);
+ u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt);
+ struct sock *sk = (struct sock *)msk;
+ u64 old_snd_una;
+
+ mptcp_data_lock(sk);
/* avoid ack expansion on update conflict, to reduce the risk of
* wrongly expanding to a future ack sequence number, which is way
* more dangerous than missing an ack
*/
+ old_snd_una = msk->snd_una;
new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
/* ACK for data not even sent yet? Ignore. */
- if (after64(new_snd_una, write_seq))
+ if (after64(new_snd_una, snd_nxt))
new_snd_una = old_snd_una;
- while (after64(new_snd_una, old_snd_una)) {
- snd_una = old_snd_una;
- old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una,
- new_snd_una);
- if (old_snd_una == snd_una) {
- mptcp_data_acked((struct sock *)msk);
- break;
- }
+ new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
+
+ if (after64(new_wnd_end, msk->wnd_end))
+ msk->wnd_end = new_wnd_end;
+
+ /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
+ if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)) &&
+ sk_stream_memory_free(ssk))
+ __mptcp_check_push(sk, ssk);
+
+ if (after64(new_snd_una, old_snd_una)) {
+ msk->snd_una = new_snd_una;
+ __mptcp_data_acked(sk);
}
+ mptcp_data_unlock(sk);
}
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
@@ -825,21 +933,37 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
return hmac == mp_opt->ahmac;
}
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
- struct tcp_options_received *opt_rx)
+void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_options_received mp_opt;
struct mptcp_ext *mpext;
- if (__mptcp_check_fallback(msk))
+ if (__mptcp_check_fallback(msk)) {
+ /* Keep it simple and unconditionally trigger send data cleanup and
+ * pending queue spooling. We will need to acquire the data lock
+ * for more accurate checks, and once the lock is acquired, such
+ * helpers are cheap.
+ */
+ mptcp_data_lock(subflow->conn);
+ if (sk_stream_memory_free(sk))
+ __mptcp_check_push(subflow->conn, sk);
+ __mptcp_data_acked(subflow->conn);
+ mptcp_data_unlock(subflow->conn);
return;
+ }
mptcp_get_options(skb, &mp_opt);
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
return;
+ if (mp_opt.fastclose &&
+ msk->local_key == mp_opt.rcvr_key) {
+ WRITE_ONCE(msk->rcv_fastclose, true);
+ mptcp_schedule_work((struct sock *)msk);
+ }
+
if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
struct mptcp_addr_info addr;
@@ -855,11 +979,21 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
addr.addr6 = mp_opt.addr6;
}
#endif
- if (!mp_opt.echo)
+ if (!mp_opt.echo) {
mptcp_pm_add_addr_received(msk, &addr);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR);
+ } else {
+ mptcp_pm_del_add_timer(msk, &addr);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
+ }
mp_opt.add_addr = 0;
}
+ if (mp_opt.rm_addr) {
+ mptcp_pm_rm_addr_received(msk, mp_opt.rm_id);
+ mp_opt.rm_addr = 0;
+ }
+
if (!mp_opt.dss)
return;
@@ -867,7 +1001,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
* monodirectional flows will get stuck
*/
if (mp_opt.use_ack)
- update_una(msk, &mp_opt);
+ ack_update_msk(msk, sk, &mp_opt);
/* Zero-data-length packets are dropped by the caller and not
* propagated to the MPTCP layer, so the skb extension does not
@@ -912,7 +1046,24 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
}
}
-void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
+static void mptcp_set_rwin(const struct tcp_sock *tp)
+{
+ const struct sock *ssk = (const struct sock *)tp;
+ const struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk;
+ u64 ack_seq;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ msk = mptcp_sk(subflow->conn);
+
+ ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
+
+ if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
+ WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+}
+
+void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
+ struct mptcp_out_options *opts)
{
if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
@@ -951,44 +1102,66 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
}
mp_capable_done:
- if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
- if (opts->ahmac)
- *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
- TCPOLEN_MPTCP_ADD_ADDR, 0,
- opts->addr_id);
- else
- *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
- TCPOLEN_MPTCP_ADD_ADDR_BASE,
- MPTCP_ADDR_ECHO,
- opts->addr_id);
- memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
- ptr += 1;
+ if ((OPTION_MPTCP_ADD_ADDR
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ | OPTION_MPTCP_ADD_ADDR6
+#endif
+ ) & opts->suboptions) {
+ u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
+ u8 echo = MPTCP_ADDR_ECHO;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions)
+ len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
+#endif
+
+ if (opts->port)
+ len += TCPOLEN_MPTCP_PORT_LEN;
+
if (opts->ahmac) {
- put_unaligned_be64(opts->ahmac, ptr);
- ptr += 2;
+ len += sizeof(opts->ahmac);
+ echo = 0;
}
- }
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ len, echo, opts->addr_id);
+ if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
+ memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
+ ptr += 1;
+ }
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
- if (opts->ahmac)
- *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
- TCPOLEN_MPTCP_ADD_ADDR6, 0,
- opts->addr_id);
- else
- *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
- TCPOLEN_MPTCP_ADD_ADDR6_BASE,
- MPTCP_ADDR_ECHO,
- opts->addr_id);
- memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
- ptr += 4;
- if (opts->ahmac) {
- put_unaligned_be64(opts->ahmac, ptr);
- ptr += 2;
+ else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
+ memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
+ ptr += 4;
}
- }
#endif
+ if (!opts->port) {
+ if (opts->ahmac) {
+ put_unaligned_be64(opts->ahmac, ptr);
+ ptr += 2;
+ }
+ } else {
+ if (opts->ahmac) {
+ u8 *bptr = (u8 *)ptr;
+
+ put_unaligned_be16(opts->port, bptr);
+ bptr += 2;
+ put_unaligned_be64(opts->ahmac, bptr);
+ bptr += 8;
+ put_unaligned_be16(TCPOPT_NOP << 8 |
+ TCPOPT_NOP, bptr);
+
+ ptr += 3;
+ } else {
+ put_unaligned_be32(opts->port << 16 |
+ TCPOPT_NOP << 8 |
+ TCPOPT_NOP, ptr);
+ ptr += 1;
+ }
+ }
+ }
+
if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
TCPOLEN_MPTCP_RM_ADDR_BASE,
@@ -1069,4 +1242,7 @@ mp_capable_done:
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
}
}
+
+ if (tp)
+ mptcp_set_rwin(tp);
}
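
In the rewritten ADD_ADDR writer above, the option length grows with the address family, an optional port and an optional truncated HMAC, and the port+HMAC variant is followed by two NOPs so the TCP option area stays 32-bit aligned. Assuming the usual constants (8-byte IPv4 base, 2-byte port, 8-byte HMAC; see the TCPOLEN_MPTCP_ADD_ADDR* definitions), a sketch of the length math:

    #include <stdio.h>

    /* assumed option sizes, mirroring the kernel's TCPOLEN_MPTCP_ADD_ADDR* */
    #define ADD_ADDR_V4_BASE        8   /* kind, len, subtype/echo, id, IPv4 */
    #define ADD_ADDR_V6_BASE        20  /* same header with an IPv6 address  */
    #define PORT_LEN                2
    #define HMAC_LEN                8

    int main(void)
    {
            int ipv6 = 0, have_port = 1, have_hmac = 1;
            unsigned int len = ipv6 ? ADD_ADDR_V6_BASE : ADD_ADDR_V4_BASE;

            if (have_port)
                    len += PORT_LEN;
            if (have_hmac)
                    len += HMAC_LEN;

            /* pad the bytes written (not the option length) to a 4-byte boundary */
            unsigned int padded = (len + 3) & ~3u;

            printf("option length %u, %u bytes on the wire (%u NOPs)\n",
                   len, padded, padded - len);
            return 0;
    }
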
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index a8ad20559aaa..da2ed576f289 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -13,23 +13,55 @@
/* path manager command handlers */
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr)
+ const struct mptcp_addr_info *addr,
+ bool echo, bool port)
{
+ u8 add_addr = READ_ONCE(msk->pm.addr_signal);
+
pr_debug("msk=%p, local_id=%d", msk, addr->id);
+ if (add_addr) {
+ pr_warn("addr_signal error, add_addr=%d", add_addr);
+ return -EINVAL;
+ }
+
msk->pm.local = *addr;
- WRITE_ONCE(msk->pm.addr_signal, true);
+ add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL);
+ if (echo)
+ add_addr |= BIT(MPTCP_ADD_ADDR_ECHO);
+ if (addr->family == AF_INET6)
+ add_addr |= BIT(MPTCP_ADD_ADDR_IPV6);
+ if (port)
+ add_addr |= BIT(MPTCP_ADD_ADDR_PORT);
+ WRITE_ONCE(msk->pm.addr_signal, add_addr);
return 0;
}
int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id)
{
- return -ENOTSUPP;
+ u8 rm_addr = READ_ONCE(msk->pm.addr_signal);
+
+ pr_debug("msk=%p, local_id=%d", msk, local_id);
+
+ if (rm_addr) {
+ pr_warn("addr_signal error, rm_addr=%d", rm_addr);
+ return -EINVAL;
+ }
+
+ msk->pm.rm_id = local_id;
+ rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL);
+ WRITE_ONCE(msk->pm.addr_signal, rm_addr);
+ return 0;
}
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id)
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id)
{
- return -ENOTSUPP;
+ pr_debug("msk=%p, local_id=%d", msk, local_id);
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_nl_rm_subflow_received(msk, local_id);
+ spin_unlock_bh(&msk->pm.lock);
+ return 0;
}
/* path manager event handlers */
@@ -46,7 +78,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side)
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
- int ret;
+ int ret = 0;
pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
pm->subflows_max, READ_ONCE(pm->accept_subflow));
@@ -56,9 +88,11 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
return false;
spin_lock_bh(&pm->lock);
- ret = pm->subflows < pm->subflows_max;
- if (ret && ++pm->subflows == pm->subflows_max)
- WRITE_ONCE(pm->accept_subflow, false);
+ if (READ_ONCE(pm->accept_subflow)) {
+ ret = pm->subflows < pm->subflows_max;
+ if (ret && ++pm->subflows == pm->subflows_max)
+ WRITE_ONCE(pm->accept_subflow, false);
+ }
spin_unlock_bh(&pm->lock);
return ret;
@@ -76,8 +110,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
return false;
msk->pm.status |= BIT(new_status);
- if (schedule_work(&msk->work))
- sock_hold((struct sock *)msk);
+ mptcp_schedule_work((struct sock *)msk);
return true;
}
@@ -93,8 +126,14 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk)
spin_lock_bh(&pm->lock);
- if (READ_ONCE(pm->work_pending))
+ /* mptcp_pm_fully_established() can be invoked by multiple
+ * racing paths - accept() and check_fully_established();
+ * be sure to serve this event only once.
+ */
+ if (READ_ONCE(pm->work_pending) &&
+ !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
+ msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
spin_unlock_bh(&pm->lock);
}
@@ -135,38 +174,83 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
READ_ONCE(pm->accept_addr));
- /* avoid acquiring the lock if there is no room for fouther addresses */
- if (!READ_ONCE(pm->accept_addr))
- return;
-
spin_lock_bh(&pm->lock);
- /* be sure there is something to signal re-checking under PM lock */
- if (READ_ONCE(pm->accept_addr) &&
- mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED))
+ if (!READ_ONCE(pm->accept_addr)) {
+ mptcp_pm_announce_addr(msk, addr, true, addr->port);
+ mptcp_pm_add_addr_send_ack(msk);
+ } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
pm->remote = *addr;
+ }
+
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
+{
+ if (!mptcp_pm_should_add_signal_ipv6(msk) &&
+ !mptcp_pm_should_add_signal_port(msk))
+ return;
+
+ mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
+}
+
+void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p remote_id=%d", msk, rm_id);
+ spin_lock_bh(&pm->lock);
+ mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED);
+ pm->rm_id = rm_id;
spin_unlock_bh(&pm->lock);
}
/* path manager helpers */
-bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
- struct mptcp_addr_info *saddr)
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_addr_info *saddr, bool *echo, bool *port)
{
int ret = false;
spin_lock_bh(&msk->pm.lock);
/* double check after the lock is acquired */
- if (!mptcp_pm_should_signal(msk))
+ if (!mptcp_pm_should_add_signal(msk))
goto out_unlock;
- if (remaining < mptcp_add_addr_len(msk->pm.local.family))
+ *echo = mptcp_pm_should_add_signal_echo(msk);
+ *port = mptcp_pm_should_add_signal_port(msk);
+
+ if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo, *port))
goto out_unlock;
*saddr = msk->pm.local;
- WRITE_ONCE(msk->pm.addr_signal, false);
+ WRITE_ONCE(msk->pm.addr_signal, 0);
+ ret = true;
+
+out_unlock:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ u8 *rm_id)
+{
+ int ret = false;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ /* double check after the lock is acquired */
+ if (!mptcp_pm_should_rm_signal(msk))
+ goto out_unlock;
+
+ if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
+ goto out_unlock;
+
+ *rm_id = msk->pm.rm_id;
+ WRITE_ONCE(msk->pm.addr_signal, 0);
ret = true;
out_unlock:
@@ -185,13 +269,15 @@ void mptcp_pm_data_init(struct mptcp_sock *msk)
msk->pm.add_addr_accepted = 0;
msk->pm.local_addr_used = 0;
msk->pm.subflows = 0;
+ msk->pm.rm_id = 0;
WRITE_ONCE(msk->pm.work_pending, false);
- WRITE_ONCE(msk->pm.addr_signal, false);
+ WRITE_ONCE(msk->pm.addr_signal, 0);
WRITE_ONCE(msk->pm.accept_addr, false);
WRITE_ONCE(msk->pm.accept_subflow, false);
msk->pm.status = 0;
spin_lock_init(&msk->pm.lock);
+ INIT_LIST_HEAD(&msk->pm.anno_list);
mptcp_pm_nl_data_init(msk);
}
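
mptcp_pm_announce_addr() and mptcp_pm_remove_addr() now encode the pending signal as a small bitmask rather than a single boolean, so the option writer can tell plain announcements, echoes, IPv6 addresses, ports and RM_ADDR requests apart from one field. A sketch of that encoding; the bit positions are illustrative, the kernel uses the MPTCP_ADD_ADDR_* / MPTCP_RM_ADDR_SIGNAL enum:

    #include <stdio.h>

    #define BIT(n)                  (1u << (n))
    /* illustrative bit assignment mirroring the enum used by the patch */
    #define ADD_ADDR_SIGNAL         BIT(0)
    #define ADD_ADDR_ECHO           BIT(1)
    #define ADD_ADDR_IPV6           BIT(2)
    #define ADD_ADDR_PORT           BIT(3)
    #define RM_ADDR_SIGNAL          BIT(4)

    int main(void)
    {
            unsigned int addr_signal = 0;

            /* announce an IPv6 address that carries a port, no echo */
            addr_signal |= ADD_ADDR_SIGNAL | ADD_ADDR_IPV6 | ADD_ADDR_PORT;

            printf("should_add_signal=%d echo=%d ipv6=%d port=%d rm=%d\n",
                   !!(addr_signal & ADD_ADDR_SIGNAL),
                   !!(addr_signal & ADD_ADDR_ECHO),
                   !!(addr_signal & ADD_ADDR_IPV6),
                   !!(addr_signal & ADD_ADDR_PORT),
                   !!(addr_signal & RM_ADDR_SIGNAL));
            return 0;
    }
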
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 770da3627848..a6d983d80576 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -15,6 +15,7 @@
#include <uapi/linux/mptcp.h>
#include "protocol.h"
+#include "mib.h"
/* forward declaration */
static struct genl_family mptcp_genl_family;
@@ -23,12 +24,18 @@ static int pm_nl_pernet_id;
struct mptcp_pm_addr_entry {
struct list_head list;
- unsigned int flags;
- int ifindex;
struct mptcp_addr_info addr;
struct rcu_head rcu;
};
+struct mptcp_pm_add_entry {
+ struct list_head list;
+ struct mptcp_addr_info addr;
+ struct timer_list add_timer;
+ struct mptcp_sock *sock;
+ u8 retrans_times;
+};
+
struct pm_nl_pernet {
/* protects pernet updates */
spinlock_t lock;
@@ -42,6 +49,7 @@ struct pm_nl_pernet {
};
#define MPTCP_PM_ADDR_MAX 8
+#define ADD_ADDR_RETRANS_MAX 3
static bool addresses_equal(const struct mptcp_addr_info *a,
struct mptcp_addr_info *b, bool use_port)
@@ -127,22 +135,20 @@ select_local_address(const struct pm_nl_pernet *pernet,
struct mptcp_pm_addr_entry *entry, *ret = NULL;
rcu_read_lock();
- spin_lock_bh(&msk->join_list_lock);
+ __mptcp_flush_join_list(msk);
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
+ if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
/* avoid any address already in use by subflows and
* pending join
*/
if (entry->addr.family == ((struct sock *)msk)->sk_family &&
- !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
- !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) {
+ !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
ret = entry;
break;
}
}
- spin_unlock_bh(&msk->join_list_lock);
rcu_read_unlock();
return ret;
}
@@ -160,7 +166,7 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
* can lead to additional addresses not being announced.
*/
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
+ if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
continue;
if (i++ == pos) {
ret = entry;
@@ -179,6 +185,125 @@ static void check_work_pending(struct mptcp_sock *msk)
WRITE_ONCE(msk->pm.work_pending, false);
}
+static struct mptcp_pm_add_entry *
+lookup_anno_list_by_saddr(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+
+ list_for_each_entry(entry, &msk->pm.anno_list, list) {
+ if (addresses_equal(&entry->addr, addr, false))
+ return entry;
+ }
+
+ return NULL;
+}
+
+static void mptcp_pm_add_timer(struct timer_list *timer)
+{
+ struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer);
+ struct mptcp_sock *msk = entry->sock;
+ struct sock *sk = (struct sock *)msk;
+
+ pr_debug("msk=%p", msk);
+
+ if (!msk)
+ return;
+
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return;
+
+ if (!entry->addr.id)
+ return;
+
+ if (mptcp_pm_should_add_signal(msk)) {
+ sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
+ goto out;
+ }
+
+ spin_lock_bh(&msk->pm.lock);
+
+ if (!mptcp_pm_should_add_signal(msk)) {
+ pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id);
+ mptcp_pm_announce_addr(msk, &entry->addr, false, entry->addr.port);
+ mptcp_pm_add_addr_send_ack(msk);
+ entry->retrans_times++;
+ }
+
+ if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
+ sk_reset_timer(sk, timer,
+ jiffies + mptcp_get_add_addr_timeout(sock_net(sk)));
+
+ spin_unlock_bh(&msk->pm.lock);
+
+out:
+ __sock_put(sk);
+}
+
+struct mptcp_pm_add_entry *
+mptcp_pm_del_add_timer(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+ struct sock *sk = (struct sock *)msk;
+
+ spin_lock_bh(&msk->pm.lock);
+ entry = lookup_anno_list_by_saddr(msk, addr);
+ if (entry)
+ entry->retrans_times = ADD_ADDR_RETRANS_MAX;
+ spin_unlock_bh(&msk->pm.lock);
+
+ if (entry)
+ sk_stop_timer_sync(sk, &entry->add_timer);
+
+ return entry;
+}
+
+static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_pm_add_entry *add_entry = NULL;
+ struct sock *sk = (struct sock *)msk;
+ struct net *net = sock_net(sk);
+
+ if (lookup_anno_list_by_saddr(msk, &entry->addr))
+ return false;
+
+ add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC);
+ if (!add_entry)
+ return false;
+
+ list_add(&add_entry->list, &msk->pm.anno_list);
+
+ add_entry->addr = entry->addr;
+ add_entry->sock = msk;
+ add_entry->retrans_times = 0;
+
+ timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
+ sk_reset_timer(sk, &add_entry->add_timer,
+ jiffies + mptcp_get_add_addr_timeout(net));
+
+ return true;
+}
+
+void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_add_entry *entry, *tmp;
+ struct sock *sk = (struct sock *)msk;
+ LIST_HEAD(free_list);
+
+ pr_debug("msk=%p", msk);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_splice_init(&msk->pm.anno_list, &free_list);
+ spin_unlock_bh(&msk->pm.lock);
+
+ list_for_each_entry_safe(entry, tmp, &free_list, list) {
+ sk_stop_timer_sync(sk, &entry->add_timer);
+ kfree(entry);
+ }
+}
+
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
struct mptcp_addr_info remote = { 0 };
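
Each announced address now gets its own mptcp_pm_add_entry with a timer: if the ADD_ADDR is still unanswered when the timer fires, it is re-announced, up to ADD_ADDR_RETRANS_MAX attempts, with the interval taken from the new add_addr_timeout sysctl. A user-space sketch of that retransmission schedule (timer machinery replaced by a simple loop, timeout value illustrative):

    #include <stdio.h>

    #define ADD_ADDR_RETRANS_MAX    3

    int main(void)
    {
            unsigned int timeout_s = 120;   /* illustrative add_addr_timeout        */
            unsigned int retrans_times = 0;
            int acked = 0;                  /* set once the peer echoes the ADD_ADDR */

            while (!acked && retrans_times < ADD_ADDR_RETRANS_MAX) {
                    printf("t=%us: retransmit ADD_ADDR (attempt %u)\n",
                           (retrans_times + 1) * timeout_s, retrans_times + 1);
                    retrans_times++;
            }
            if (!acked)
                    printf("giving up after %u attempts\n", retrans_times);
            return 0;
    }
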
@@ -186,7 +311,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
struct mptcp_pm_addr_entry *local;
struct pm_nl_pernet *pernet;
- pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ pernet = net_generic(sock_net(sk), pm_nl_pernet_id);
pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
msk->pm.local_addr_used, msk->pm.local_addr_max,
@@ -199,8 +324,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
msk->pm.add_addr_signaled);
if (local) {
- msk->pm.add_addr_signaled++;
- mptcp_pm_announce_addr(msk, &local->addr);
+ if (mptcp_pm_alloc_anno_list(msk, local)) {
+ msk->pm.add_addr_signaled++;
+ mptcp_pm_announce_addr(msk, &local->addr, false, local->addr.port);
+ mptcp_pm_nl_add_addr_send_ack(msk);
+ }
} else {
/* pick failed, avoid further attempts later */
msk->pm.local_addr_used = msk->pm.add_addr_signal_max;
@@ -220,8 +348,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
msk->pm.subflows++;
check_work_pending(msk);
spin_unlock_bh(&msk->pm.lock);
- __mptcp_subflow_connect(sk, local->ifindex,
- &local->addr, &remote);
+ __mptcp_subflow_connect(sk, &local->addr, &remote);
spin_lock_bh(&msk->pm.lock);
return;
}
@@ -247,6 +374,7 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info remote;
struct mptcp_addr_info local;
+ bool use_port = false;
pr_debug("accepted %d:%d remote family %d",
msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max,
@@ -263,17 +391,125 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
remote = msk->pm.remote;
if (!remote.port)
remote.port = sk->sk_dport;
+ else
+ use_port = true;
memset(&local, 0, sizeof(local));
local.family = remote.family;
spin_unlock_bh(&msk->pm.lock);
- __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote);
+ __mptcp_subflow_connect(sk, &local, &remote);
spin_lock_bh(&msk->pm.lock);
+
+ mptcp_pm_announce_addr(msk, &remote, true, use_port);
+ mptcp_pm_nl_add_addr_send_ack(msk);
+}
+
+void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (!mptcp_pm_should_add_signal_ipv6(msk) &&
+ !mptcp_pm_should_add_signal_port(msk))
+ return;
+
+ __mptcp_flush_join_list(msk);
+ subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node);
+ if (subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ u8 add_addr;
+
+ spin_unlock_bh(&msk->pm.lock);
+ if (mptcp_pm_should_add_signal_ipv6(msk))
+ pr_debug("send ack for add_addr6");
+ if (mptcp_pm_should_add_signal_port(msk))
+ pr_debug("send ack for add_addr_port");
+
+ lock_sock(ssk);
+ tcp_send_ack(ssk);
+ release_sock(ssk);
+ spin_lock_bh(&msk->pm.lock);
+
+ add_addr = READ_ONCE(msk->pm.addr_signal);
+ if (mptcp_pm_should_add_signal_ipv6(msk))
+ add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6);
+ if (mptcp_pm_should_add_signal_port(msk))
+ add_addr &= ~BIT(MPTCP_ADD_ADDR_PORT);
+ WRITE_ONCE(msk->pm.addr_signal, add_addr);
+ }
+}
+
+void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = (struct sock *)msk;
+
+ pr_debug("address rm_id %d", msk->pm.rm_id);
+
+ if (!msk->pm.rm_id)
+ return;
+
+ if (list_empty(&msk->conn_list))
+ return;
+
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
+
+ if (msk->pm.rm_id != subflow->remote_id)
+ continue;
+
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, how);
+ __mptcp_close_ssk(sk, ssk, subflow);
+ spin_lock_bh(&msk->pm.lock);
+
+ msk->pm.add_addr_accepted--;
+ msk->pm.subflows--;
+ WRITE_ONCE(msk->pm.accept_addr, true);
+
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMADDR);
+
+ break;
+ }
+}
+
+void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = (struct sock *)msk;
+
+ pr_debug("subflow rm_id %d", rm_id);
+
+ if (!rm_id)
+ return;
+
+ if (list_empty(&msk->conn_list))
+ return;
+
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
+
+ if (rm_id != subflow->local_id)
+ continue;
+
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, how);
+ __mptcp_close_ssk(sk, ssk, subflow);
+ spin_lock_bh(&msk->pm.lock);
+
+ msk->pm.local_addr_used--;
+ msk->pm.subflows--;
+
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
+
+ break;
+ }
}
static bool address_use_port(struct mptcp_pm_addr_entry *entry)
{
- return (entry->flags &
+ return (entry->addr.flags &
(MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
MPTCP_PM_ADDR_FLAG_SIGNAL;
}
@@ -303,9 +539,9 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
goto out;
}
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
pernet->add_addr_signal_max++;
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
pernet->local_addr_max++;
entry->addr.id = pernet->next_id++;
@@ -358,8 +594,9 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
if (!entry)
return -ENOMEM;
- entry->flags = 0;
entry->addr = skc_local;
+ entry->addr.ifindex = 0;
+ entry->addr.flags = 0;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0)
kfree(entry);
@@ -397,8 +634,8 @@ mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
[MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
[MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
[MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
- [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct in6_addr), },
+ [MPTCP_PM_ADDR_ATTR_ADDR6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
[MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 },
[MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 },
[MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 },
@@ -473,14 +710,17 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]);
skip_family:
- if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX])
- entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+ if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) {
+ u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+
+ entry->addr.ifindex = val;
+ }
if (tb[MPTCP_PM_ADDR_ATTR_ID])
entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
- entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+ entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
return 0;
}
@@ -530,6 +770,68 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
return NULL;
}
+static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+
+ entry = mptcp_pm_del_add_timer(msk, addr);
+ if (entry) {
+ list_del(&entry->list);
+ kfree(entry);
+ return true;
+ }
+
+ return false;
+}
+
+static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ bool force)
+{
+ bool ret;
+
+ ret = remove_anno_list_by_saddr(msk, addr);
+ if (ret || force) {
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, addr->id);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+ return ret;
+}
+
+static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_sock *msk;
+ long s_slot = 0, s_num = 0;
+
+ pr_debug("remove_id=%d", addr->id);
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+ bool remove_subflow;
+
+ if (list_empty(&msk->conn_list)) {
+ mptcp_pm_remove_anno_addr(msk, addr, false);
+ goto next;
+ }
+
+ lock_sock(sk);
+ remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
+ mptcp_pm_remove_anno_addr(msk, addr, remove_subflow);
+ if (remove_subflow)
+ mptcp_pm_remove_subflow(msk, addr->id);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
@@ -545,29 +847,32 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
entry = __lookup_addr_by_id(pernet, addr.addr.id);
if (!entry) {
GENL_SET_ERR_MSG(info, "address not found");
- ret = -EINVAL;
- goto out;
+ spin_unlock_bh(&pernet->lock);
+ return -EINVAL;
}
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
pernet->add_addr_signal_max--;
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
pernet->local_addr_max--;
pernet->addrs--;
list_del_rcu(&entry->list);
- kfree_rcu(entry, rcu);
-out:
spin_unlock_bh(&pernet->lock);
+
+ mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr);
+ kfree_rcu(entry, rcu);
+
return ret;
}
-static void __flush_addrs(struct pm_nl_pernet *pernet)
+static void __flush_addrs(struct net *net, struct list_head *list)
{
- while (!list_empty(&pernet->local_addr_list)) {
+ while (!list_empty(list)) {
struct mptcp_pm_addr_entry *cur;
- cur = list_entry(pernet->local_addr_list.next,
+ cur = list_entry(list->next,
struct mptcp_pm_addr_entry, list);
+ mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr);
list_del_rcu(&cur->list);
kfree_rcu(cur, rcu);
}
@@ -584,11 +889,13 @@ static void __reset_counters(struct pm_nl_pernet *pernet)
static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ LIST_HEAD(free_list);
spin_lock_bh(&pernet->lock);
- __flush_addrs(pernet);
+ list_splice_init(&pernet->local_addr_list, &free_list);
__reset_counters(pernet);
spin_unlock_bh(&pernet->lock);
+ __flush_addrs(sock_net(skb->sk), &free_list);
return 0;
}
@@ -606,10 +913,10 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb,
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
goto nla_put_failure;
- if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
+ if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags))
goto nla_put_failure;
- if (entry->ifindex &&
- nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
+ if (entry->addr.ifindex &&
+ nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->addr.ifindex))
goto nla_put_failure;
if (addr->family == AF_INET &&
@@ -789,7 +1096,7 @@ fail:
return -EMSGSIZE;
}
-static struct genl_ops mptcp_pm_ops[] = {
+static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_ADD_ADDR,
.doit = mptcp_nl_cmd_add_addr,
@@ -828,8 +1135,8 @@ static struct genl_family mptcp_genl_family __ro_after_init = {
.policy = mptcp_pm_policy,
.netnsok = true,
.module = THIS_MODULE,
- .ops = mptcp_pm_ops,
- .n_ops = ARRAY_SIZE(mptcp_pm_ops),
+ .small_ops = mptcp_pm_ops,
+ .n_small_ops = ARRAY_SIZE(mptcp_pm_ops),
.mcgrps = mptcp_pm_mcgrps,
.n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
};
@@ -850,10 +1157,12 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list)
struct net *net;
list_for_each_entry(net, net_list, exit_list) {
+ struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id);
+
/* net is removed from namespace list, can't race with
* other modifiers
*/
- __flush_addrs(net_generic(net, pm_nl_pernet_id));
+ __flush_addrs(net, &pernet->local_addr_list);
}
}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5d747c6a610e..f998a077c7dd 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,11 +21,10 @@
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
+#include <net/xfrm.h>
#include "protocol.h"
#include "mib.h"
-#define MPTCP_SAME_STATE TCP_MAX_STATES
-
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
struct mptcp_sock msk;
@@ -34,6 +33,8 @@ struct mptcp6_sock {
#endif
struct mptcp_skb_cb {
+ u64 map_seq;
+ u64 end_seq;
u32 offset;
};
@@ -41,6 +42,9 @@ struct mptcp_skb_cb {
static struct percpu_counter mptcp_sockets_allocated;
+static void __mptcp_destroy_sock(struct sock *sk);
+static void __mptcp_check_send_data_fin(struct sock *sk);
+
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket.
* Otherwise return NULL.
@@ -53,6 +57,12 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
return msk->subflow;
}
+/* Returns end sequence number of the receiver's advertised window */
+static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->wnd_end);
+}
+
static bool mptcp_is_tcpsk(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
@@ -102,6 +112,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
msk->subflow = ssock;
subflow = mptcp_subflow_ctx(ssock->sk);
list_add(&subflow->node, &msk->conn_list);
+ sock_hold(ssock->sk);
subflow->request_mptcp = 1;
/* accept() will wait on first subflow sk_wq, and we always wakes up
@@ -112,35 +123,208 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
return 0;
}
-static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
- struct sk_buff *skb,
- unsigned int offset, size_t copy_len)
+static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
+static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ bool fragstolen;
+ int delta;
+
+ if (MPTCP_SKB_CB(from)->offset ||
+ !skb_try_coalesce(to, from, &fragstolen, &delta))
+ return false;
+
+ pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx",
+ MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
+ to->len, MPTCP_SKB_CB(from)->end_seq);
+ MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+ kfree_skb_partial(from, fragstolen);
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ return true;
+}
+
+static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
+ return false;
+
+ return mptcp_try_coalesce((struct sock *)msk, to, from);
+}
+
+/* "inspired" by tcp_data_queue_ofo(), main differences:
+ * - use mptcp seqs
+ * - don't cope with sacks
+ */
+static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
struct sock *sk = (struct sock *)msk;
- struct sk_buff *tail;
+ struct rb_node **p, *parent;
+ u64 seq, end_seq, max_seq;
+ struct sk_buff *skb1;
+
+ seq = MPTCP_SKB_CB(skb)->map_seq;
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ max_seq = READ_ONCE(msk->rcv_wnd_sent);
+
+ pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
+ RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ if (after64(end_seq, max_seq)) {
+ /* out of window */
+ mptcp_drop(sk, skb);
+ pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
+ (unsigned long long)end_seq - (unsigned long long)max_seq,
+ (unsigned long long)msk->rcv_wnd_sent);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
+ return;
+ }
- __skb_unlink(skb, &ssk->sk_receive_queue);
+ p = &msk->out_of_order_queue.rb_node;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
+ if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
+ msk->ooo_last_skb = skb;
+ goto end;
+ }
- skb_ext_reset(skb);
- skb_orphan(skb);
- WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
+ /* with 2 subflows, adding at the end of the ooo queue is quite likely.
+ * Using ooo_last_skb avoids the O(log n) rbtree lookup.
+ */
+ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ return;
+ }
- tail = skb_peek_tail(&sk->sk_receive_queue);
- if (offset == 0 && tail) {
- bool fragstolen;
- int delta;
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ parent = &msk->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
- if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
- kfree_skb_partial(skb, fragstolen);
- atomic_add(delta, &sk->sk_rmem_alloc);
- sk_mem_charge(sk, delta);
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ return;
+ }
+ if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ /* partial overlap:
+ * | skb |
+ * | skb1 |
+ * continue traversing
+ */
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ goto merge_right;
+ }
+ } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
return;
}
+ p = &parent->rb_right;
}
+insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
+
+merge_right:
+ /* Remove other segments covered by skb. */
+ while ((skb1 = skb_rb_next(skb)) != NULL) {
+ if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
+ break;
+ rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ }
+ /* If there is no skb after us, we are the last_skb! */
+ if (!skb1)
+ msk->ooo_last_skb = skb;
+
+end:
+ skb_condense(skb);
skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+}
+
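/*
 * A minimal reference sketch (not part of this change) of the wrap-safe
 * 64-bit sequence comparisons the out-of-order handling above relies on,
 * matching the helpers protocol.h already provides:
 *
 *	static inline bool before64(__u64 seq1, __u64 seq2)
 *	{
 *		return (__s64)(seq1 - seq2) < 0;	// seq1 precedes seq2, modulo 2^64
 *	}
 *
 *	#define after64(seq2, seq1)	before64(seq1, seq2)
 */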
+static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb, unsigned int offset,
+ size_t copy_len)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *tail;
+
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+
+ skb_ext_reset(skb);
+ skb_orphan(skb);
+
+ /* try to fetch required memory from subflow */
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+ if (ssk->sk_forward_alloc < skb->truesize)
+ goto drop;
+ __sk_mem_reclaim(ssk, skb->truesize);
+ if (!sk_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+ }
+
+ /* the skb map_seq accounts for the skb offset:
+ * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
+ * value
+ */
+ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
MPTCP_SKB_CB(skb)->offset = offset;
+
+ if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
+ /* in sequence */
+ WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (tail && mptcp_try_coalesce(sk, tail, skb))
+ return true;
+
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ return true;
+ } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
+ mptcp_data_queue_ofo(msk, skb);
+ return false;
+ }
+
+ /* old data, keep it simple and drop the whole pkt, the sender
+ * will retransmit it if needed.
+ */
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+drop:
+ mptcp_drop(sk, skb);
+ return false;
}
static void mptcp_stop_timer(struct sock *sk)
@@ -151,38 +335,35 @@ static void mptcp_stop_timer(struct sock *sk)
mptcp_sk(sk)->timer_ival = 0;
}
-/* both sockets must be locked */
-static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
- struct sock *ssk)
+static void mptcp_close_wake_up(struct sock *sk)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);
+ if (sock_flag(sk, SOCK_DEAD))
+ return;
- /* revalidate data sequence number.
- *
- * mptcp_subflow_data_available() is usually called
- * without msk lock. Its unlikely (but possible)
- * that msk->ack_seq has been advanced since the last
- * call found in-sequence data.
- */
- if (likely(dsn == msk->ack_seq))
- return true;
+ sk->sk_state_change(sk);
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+}
+
+static bool mptcp_pending_data_fin_ack(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
- subflow->data_avail = 0;
- return mptcp_subflow_data_available(ssk);
+ return !__mptcp_check_fallback(msk) &&
+ ((1 << sk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
+ msk->write_seq == READ_ONCE(msk->snd_una);
}
static void mptcp_check_data_fin_ack(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (__mptcp_check_fallback(msk))
- return;
-
/* Look for an acknowledged DATA_FIN */
- if (((1 << sk->sk_state) &
- (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
- msk->write_seq == atomic64_read(&msk->snd_una)) {
+ if (mptcp_pending_data_fin_ack(sk)) {
mptcp_stop_timer(sk);
WRITE_ONCE(msk->snd_data_fin_enable, 0);
@@ -190,20 +371,14 @@ static void mptcp_check_data_fin_ack(struct sock *sk)
switch (sk->sk_state) {
case TCP_FIN_WAIT1:
inet_sk_state_store(sk, TCP_FIN_WAIT2);
- sk->sk_state_change(sk);
break;
case TCP_CLOSING:
case TCP_LAST_ACK:
inet_sk_state_store(sk, TCP_CLOSE);
- sk->sk_state_change(sk);
break;
}
- if (sk->sk_shutdown == SHUTDOWN_MASK ||
- sk->sk_state == TCP_CLOSE)
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
- else
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ mptcp_close_wake_up(sk);
}
}
@@ -237,13 +412,79 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
-static void mptcp_check_data_fin(struct sock *sk)
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ /* can't send if JOIN hasn't completed yet (i.e. the subflow is not yet usable for mptcp) */
+ if (subflow->request_join && !subflow->fully_established)
+ return false;
+
+ /* only send if our side has not closed yet */
+ return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+static bool tcp_can_send_ack(const struct sock *ssk)
+{
+ return !((1 << inet_sk_state_load(ssk)) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN));
+}
+
+static void mptcp_send_ack(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+ if (tcp_can_send_ack(ssk))
+ tcp_send_ack(ssk);
+ release_sock(ssk);
+ }
+}
+
+static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk)
+{
+ int ret;
+
+ lock_sock(ssk);
+ ret = tcp_can_send_ack(ssk);
+ if (ret)
+ tcp_cleanup_rbuf(ssk, 1);
+ release_sock(ssk);
+ return ret;
+}
+
+static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
+{
+ struct sock *ack_hint = READ_ONCE(msk->ack_hint);
+ struct mptcp_subflow_context *subflow;
+
+ /* if the hinted ssk is still active, try to use it */
+ if (likely(ack_hint)) {
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk))
+ return;
+ }
+ }
+
+ /* otherwise pick the first active subflow */
+ mptcp_for_each_subflow(msk, subflow)
+ if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow)))
+ return;
+}
+
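/*
 * Note on the hint used above: __mptcp_move_skbs_from_subflow() (later in
 * this patch) records the last subflow that delivered in-order data via
 * WRITE_ONCE(msk->ack_hint, ssk), so mptcp_cleanup_rbuf() can usually emit
 * the window update on the subflow the peer is actively using. Sketch of
 * the resulting flow:
 *
 *	WRITE_ONCE(msk->ack_hint, ssk);	// receive path, after moving data
 *	...
 *	mptcp_cleanup_rbuf(msk);	// tries ack_hint first, then any active subflow
 */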
+static bool mptcp_check_data_fin(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
u64 rcv_data_fin_seq;
+ bool ret = false;
if (__mptcp_check_fallback(msk) || !msk->first)
- return;
+ return ret;
/* Need to ack a DATA_FIN received from a peer while this side
* of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
@@ -259,8 +500,6 @@ static void mptcp_check_data_fin(struct sock *sk)
*/
if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
- struct mptcp_subflow_context *subflow;
-
WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
WRITE_ONCE(msk->rcv_data_fin, 0);
@@ -277,7 +516,6 @@ static void mptcp_check_data_fin(struct sock *sk)
break;
case TCP_FIN_WAIT2:
inet_sk_state_store(sk, TCP_CLOSE);
- // @@ Close subflows now?
break;
default:
/* Other states not expected */
@@ -285,23 +523,12 @@ static void mptcp_check_data_fin(struct sock *sk)
break;
}
+ ret = true;
mptcp_set_timeout(sk, NULL);
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- lock_sock(ssk);
- tcp_send_ack(ssk);
- release_sock(ssk);
- }
-
- sk->sk_state_change(sk);
-
- if (sk->sk_shutdown == SHUTDOWN_MASK ||
- sk->sk_state == TCP_CLOSE)
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
- else
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ mptcp_send_ack(msk);
+ mptcp_close_wake_up(sk);
}
+ return ret;
}
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
@@ -314,12 +541,20 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
bool more_data_avail;
struct tcp_sock *tp;
bool done = false;
+ int sk_rbuf;
- if (!mptcp_subflow_dsn_valid(msk, ssk)) {
- *bytes = 0;
- return false;
+ sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
+
+ if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
+
+ if (unlikely(ssk_rbuf > sk_rbuf)) {
+ WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
+ sk_rbuf = ssk_rbuf;
+ }
}
+ pr_debug("msk=%p ssk=%p", msk, ssk);
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
@@ -332,8 +567,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
mptcp_subflow_get_map_offset(subflow);
skb = skb_peek(&ssk->sk_receive_queue);
- if (!skb)
+ if (!skb) {
+ /* if no data is found, a racing workqueue/recvmsg
+ * already processed the new data, stop here or we
+ * can enter an infinite loop
+ */
+ if (!moved)
+ done = true;
break;
+ }
if (__mptcp_check_fallback(msk)) {
/* if we are running under the workqueue, TCP could have
@@ -357,9 +599,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
if (tp->urg_data)
done = true;
- __mptcp_move_skb(msk, ssk, skb, offset, len);
+ if (__mptcp_move_skb(msk, ssk, skb, offset, len))
+ moved += len;
seq += len;
- moved += len;
if (WARN_ON_ONCE(map_remaining < len))
break;
@@ -372,78 +614,125 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
WRITE_ONCE(tp->copied_seq, seq);
more_data_avail = mptcp_subflow_data_available(ssk);
- if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
+ if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
done = true;
break;
}
} while (more_data_avail);
+ WRITE_ONCE(msk->ack_hint, ssk);
- *bytes = moved;
+ *bytes += moved;
+ return done;
+}
- /* If the moves have caught up with the DATA_FIN sequence number
- * it's time to ack the DATA_FIN and change socket state, but
- * this is not a good place to change state. Let the workqueue
- * do it.
- */
- if (mptcp_pending_data_fin(sk, NULL) &&
- schedule_work(&msk->work))
- sock_hold(sk);
+static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb, *tail;
+ bool moved = false;
+ struct rb_node *p;
+ u64 end_seq;
+
+ p = rb_first(&msk->out_of_order_queue);
+ pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ while (p) {
+ skb = rb_to_skb(p);
+ if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
+ break;
- return done;
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &msk->out_of_order_queue);
+
+ if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
+ msk->ack_seq))) {
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ continue;
+ }
+
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
+ int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+
+ /* skip overlapping data, if any */
+ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
+ MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
+ delta);
+ MPTCP_SKB_CB(skb)->offset += delta;
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+ msk->ack_seq = end_seq;
+ moved = true;
+ }
+ return moved;
}
/* In most cases we will be able to lock the mptcp socket. If it's already
* owned, we need to defer to the work queue to avoid ABBA deadlock.
*/
-static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
+static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
struct sock *sk = (struct sock *)msk;
unsigned int moved = 0;
- if (READ_ONCE(sk->sk_lock.owned))
- return false;
-
- if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
- return false;
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return;
- /* must re-check after taking the lock */
- if (!READ_ONCE(sk->sk_lock.owned))
- __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ mptcp_data_lock(sk);
- spin_unlock_bh(&sk->sk_lock.slock);
+ __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ __mptcp_ofo_queue(msk);
- return moved > 0;
+ /* If the moves have caught up with the DATA_FIN sequence number
+ * it's time to ack the DATA_FIN and change socket state, but
+ * this is not a good place to change state. Let the workqueue
+ * do it.
+ */
+ if (mptcp_pending_data_fin(sk, NULL))
+ mptcp_schedule_work(sk);
+ mptcp_data_unlock(sk);
}
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(sk);
+ int sk_rbuf, ssk_rbuf;
+ bool wake;
- set_bit(MPTCP_DATA_READY, &msk->flags);
+ /* The peer can send data while we are shutting down this
+ * subflow at msk destruction time, but we must avoid enqueuing
+ * more data to the msk receive queue
+ */
+ if (unlikely(subflow->disposable))
+ return;
- if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
- move_skbs_to_msk(msk, ssk))
- goto wake;
+ /* move_skbs_to_msk below can legitimately clear the data_avail flag,
+ * but we will later need to properly wake the reader, so cache its
+ * value
+ */
+ wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
+ if (wake)
+ set_bit(MPTCP_DATA_READY, &msk->flags);
- /* don't schedule if mptcp sk is (still) over limit */
- if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
+ ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
+ sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
+ if (unlikely(ssk_rbuf > sk_rbuf))
+ sk_rbuf = ssk_rbuf;
+
+ /* over limit? can't append more skbs to msk */
+ if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
goto wake;
- /* mptcp socket is owned, release_cb should retry */
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
- &sk->sk_tsq_flags)) {
- sock_hold(sk);
+ move_skbs_to_msk(msk, ssk);
- /* need to try again, its possible release_cb() has already
- * been called after the test_and_set_bit() above.
- */
- move_skbs_to_msk(msk, ssk);
- }
wake:
- sk->sk_data_ready(sk);
+ if (wake)
+ sk->sk_data_ready(sk);
}
-static void __mptcp_flush_join_list(struct mptcp_sock *msk)
+void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
if (likely(list_empty(&msk->join_list)))
return;
@@ -463,6 +752,10 @@ static void mptcp_reset_timer(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned long tout;
+ /* prevent rescheduling on close */
+ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
+ return;
+
/* should never be called with mptcp level timer cleared */
tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
if (WARN_ON_ONCE(!tout))
@@ -470,23 +763,23 @@ static void mptcp_reset_timer(struct sock *sk)
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}
-void mptcp_data_acked(struct sock *sk)
+bool mptcp_schedule_work(struct sock *sk)
{
- mptcp_reset_timer(sk);
-
- if ((!sk_stream_is_writeable(sk) ||
- (inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
- schedule_work(&mptcp_sk(sk)->work))
+ if (inet_sk_state_load(sk) != TCP_CLOSE &&
+ schedule_work(&mptcp_sk(sk)->work)) {
+ /* each subflow already holds a reference to the sk, and the
+ * workqueue is invoked by a subflow, so sk can't go away here.
+ */
sock_hold(sk);
+ return true;
+ }
+ return false;
}
void mptcp_subflow_eof(struct sock *sk)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
-
- if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
- schedule_work(&msk->work))
- sock_hold(sk);
+ if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
+ mptcp_schedule_work(sk);
}
static void mptcp_check_for_eof(struct mptcp_sock *msk)
@@ -497,8 +790,10 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk)
mptcp_for_each_subflow(msk, subflow)
receivers += !subflow->rx_eof;
+ if (receivers)
+ return;
- if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
/* hopefully temporary hack: propagate shutdown status
* to msk, when all subflows agree on it
*/
@@ -508,16 +803,21 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk)
set_bit(MPTCP_DATA_READY, &msk->flags);
sk->sk_data_ready(sk);
}
-}
-
-static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
-{
- const struct sock *sk = (const struct sock *)msk;
- if (!msk->cached_ext)
- msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);
-
- return !!msk->cached_ext;
+ switch (sk->sk_state) {
+ case TCP_ESTABLISHED:
+ inet_sk_state_store(sk, TCP_CLOSE_WAIT);
+ break;
+ case TCP_FIN_WAIT1:
+ inet_sk_state_store(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ inet_sk_state_store(sk, TCP_CLOSE);
+ break;
+ default:
+ return;
+ }
+ mptcp_close_wake_up(sk);
}
static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
@@ -542,8 +842,11 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq,
if (!tcp_skb_can_collapse_to(skb))
return false;
- /* can collapse only if MPTCP level sequence is in order */
- return mpext && mpext->data_seq + mpext->data_len == write_seq;
+ /* can collapse only if MPTCP level sequence is in order and this
+ * mapping has not been xmitted yet
+ */
+ return mpext && mpext->data_seq + mpext->data_len == write_seq &&
+ !mpext->frozen;
}
static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
@@ -551,9 +854,128 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
const struct mptcp_data_frag *df)
{
return df && pfrag->page == df->page &&
+ pfrag->size - pfrag->offset > 0 &&
df->data_seq + df->data_len == msk->write_seq;
}
+static int mptcp_wmem_with_overhead(struct sock *sk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int ret, skbs;
+
+ ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
+ skbs = (msk->tx_pending_data + size) / msk->size_goal_cache;
+ if (skbs < msk->skb_tx_cache.qlen)
+ return ret;
+
+ return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER);
+}
+
+static void __mptcp_wmem_reserve(struct sock *sk, int size)
+{
+ int amount = mptcp_wmem_with_overhead(sk, size);
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ WARN_ON_ONCE(msk->wmem_reserved);
+ if (WARN_ON_ONCE(amount < 0))
+ amount = 0;
+
+ if (amount <= sk->sk_forward_alloc)
+ goto reserve;
+
+ /* under memory pressure try to reserve at most a single page,
+ * otherwise try to reserve the full estimate and fall back
+ * to a single page before entering the error path
+ */
+ if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
+ !sk_wmem_schedule(sk, amount)) {
+ if (amount <= PAGE_SIZE)
+ goto nomem;
+
+ amount = PAGE_SIZE;
+ if (!sk_wmem_schedule(sk, amount))
+ goto nomem;
+ }
+
+reserve:
+ msk->wmem_reserved = amount;
+ sk->sk_forward_alloc -= amount;
+ return;
+
+nomem:
+ /* we will wait for memory on next allocation */
+ msk->wmem_reserved = -1;
+}
+
+static void __mptcp_update_wmem(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!msk->wmem_reserved)
+ return;
+
+ if (msk->wmem_reserved < 0)
+ msk->wmem_reserved = 0;
+ if (msk->wmem_reserved > 0) {
+ sk->sk_forward_alloc += msk->wmem_reserved;
+ msk->wmem_reserved = 0;
+ }
+}
+
+static bool mptcp_wmem_alloc(struct sock *sk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ /* check for pre-existing error condition */
+ if (msk->wmem_reserved < 0)
+ return false;
+
+ if (msk->wmem_reserved >= size)
+ goto account;
+
+ mptcp_data_lock(sk);
+ if (!sk_wmem_schedule(sk, size)) {
+ mptcp_data_unlock(sk);
+ return false;
+ }
+
+ sk->sk_forward_alloc -= size;
+ msk->wmem_reserved += size;
+ mptcp_data_unlock(sk);
+
+account:
+ msk->wmem_reserved -= size;
+ return true;
+}
+
+static void mptcp_wmem_uncharge(struct sock *sk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (msk->wmem_reserved < 0)
+ msk->wmem_reserved = 0;
+ msk->wmem_reserved += size;
+}
+
+static void mptcp_mem_reclaim_partial(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ /* if we are experiencing a transient allocation error,
+ * the forward allocation memory has been already
+ * released
+ */
+ if (msk->wmem_reserved < 0)
+ return;
+
+ mptcp_data_lock(sk);
+ sk->sk_forward_alloc += msk->wmem_reserved;
+ sk_mem_reclaim_partial(sk);
+ msk->wmem_reserved = sk->sk_forward_alloc;
+ sk->sk_forward_alloc = 0;
+ mptcp_data_unlock(sk);
+}
+
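/*
 * Rough sketch of how the wmem_reserved accounting above is meant to be
 * used (illustrative only; copy_fails() and tx_bytes are hypothetical
 * stand-ins, the real usage is in mptcp_sendmsg() further below):
 *
 *	mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, tx_bytes));
 *	...
 *	if (!mptcp_wmem_alloc(sk, chunk))	// consume the reserve, topping it up
 *		goto wait_for_memory;		// under the msk data lock if needed
 *	if (copy_fails())
 *		mptcp_wmem_uncharge(sk, chunk);	// hand the charge back
 *	...
 *	__mptcp_update_wmem(sk);		// fold leftovers into sk_forward_alloc
 *	release_sock(sk);
 */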
static void dfrag_uncharge(struct sock *sk, int len)
{
sk_mem_uncharge(sk, len);
@@ -569,7 +991,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page);
}
-static void mptcp_clean_una(struct sock *sk)
+static void __mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
@@ -580,13 +1002,15 @@ static void mptcp_clean_una(struct sock *sk)
* plain TCP
*/
if (__mptcp_check_fallback(msk))
- atomic64_set(&msk->snd_una, msk->write_seq);
- snd_una = atomic64_read(&msk->snd_una);
+ msk->snd_una = READ_ONCE(msk->snd_nxt);
+ snd_una = msk->snd_una;
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
break;
+ if (WARN_ON_ONCE(dfrag == msk->first_pending))
+ break;
dfrag_clear(sk, dfrag);
cleaned = true;
}
@@ -595,12 +1019,13 @@ static void mptcp_clean_una(struct sock *sk)
if (dfrag && after64(snd_una, dfrag->data_seq)) {
u64 delta = snd_una - dfrag->data_seq;
- if (WARN_ON_ONCE(delta > dfrag->data_len))
+ if (WARN_ON_ONCE(delta > dfrag->already_sent))
goto out;
dfrag->data_seq += delta;
dfrag->offset += delta;
dfrag->data_len -= delta;
+ dfrag->already_sent -= delta;
dfrag_uncharge(sk, delta);
cleaned = true;
@@ -608,11 +1033,41 @@ static void mptcp_clean_una(struct sock *sk)
out:
if (cleaned) {
- sk_mem_reclaim_partial(sk);
+ if (tcp_under_memory_pressure(sk)) {
+ __mptcp_update_wmem(sk);
+ sk_mem_reclaim_partial(sk);
+ }
- /* Only wake up writers if a subflow is ready */
- if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
- sk_stream_write_space(sk);
+ if (sk_stream_is_writeable(sk)) {
+ /* pairs with memory barrier in mptcp_poll */
+ smp_mb();
+ if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
+ sk_stream_write_space(sk);
+ }
+ }
+
+ if (snd_una == READ_ONCE(msk->snd_nxt)) {
+ if (msk->timer_ival)
+ mptcp_stop_timer(sk);
+ } else {
+ mptcp_reset_timer(sk);
+ }
+}
+
+static void mptcp_enter_memory_pressure(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool first = true;
+
+ sk_stream_moderate_sndbuf(sk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (first)
+ tcp_enter_memory_pressure(ssk);
+ sk_stream_moderate_sndbuf(ssk);
+ first = false;
}
}
@@ -625,8 +1080,7 @@ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
pfrag, sk->sk_allocation)))
return true;
- sk->sk_prot->enter_memory_pressure(sk);
- sk_stream_moderate_sndbuf(sk);
+ mptcp_enter_memory_pressure(sk);
return false;
}
@@ -642,149 +1096,241 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
dfrag->data_seq = msk->write_seq;
dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
dfrag->offset = offset + sizeof(struct mptcp_data_frag);
+ dfrag->already_sent = 0;
dfrag->page = pfrag->page;
return dfrag;
}
-static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
- struct msghdr *msg, struct mptcp_data_frag *dfrag,
- long *timeo, int *pmss_now,
- int *ps_goal)
+struct mptcp_sendmsg_info {
+ int mss_now;
+ int size_goal;
+ u16 limit;
+ u16 sent;
+ unsigned int flags;
+};
+
+static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
+ int avail_size)
+{
+ u64 window_end = mptcp_wnd_end(msk);
+
+ if (__mptcp_check_fallback(msk))
+ return avail_size;
+
+ if (!before64(data_seq + avail_size, window_end)) {
+ u64 allowed_size = window_end - data_seq;
+
+ return min_t(unsigned int, allowed_size, avail_size);
+ }
+
+ return avail_size;
+}
+
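/*
 * Worked example for mptcp_check_allowed_size() above, with illustrative
 * numbers: if window_end == 1000 and data_seq == 990, then
 *
 *	mptcp_check_allowed_size(msk, 990, 100) returns 10	// clamped to the window
 *	mptcp_check_allowed_size(msk, 990, 5)   returns 5	// fits, unchanged
 *
 * i.e. the send size is capped at window_end - data_seq whenever the
 * requested size would cross the advertised window end.
 */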
+static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp)
+{
+ struct skb_ext *mpext = __skb_ext_alloc(gfp);
+
+ if (!mpext)
+ return false;
+ __skb_ext_set(skb, SKB_EXT_MPTCP, mpext);
+ return true;
+}
+
+static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
+ if (likely(skb)) {
+ if (likely(__mptcp_add_ext(skb, gfp))) {
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->reserved_tailroom = skb->end - skb->tail;
+ return skb;
+ }
+ __kfree_skb(skb);
+ } else {
+ mptcp_enter_memory_pressure(sk);
+ }
+ return NULL;
+}
+
+static bool mptcp_tx_cache_refill(struct sock *sk, int size,
+ struct sk_buff_head *skbs, int *total_ts)
{
- int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
- bool dfrag_collapsed, can_collapse = false;
struct mptcp_sock *msk = mptcp_sk(sk);
- struct mptcp_ext *mpext = NULL;
- bool retransmission = !!dfrag;
- struct sk_buff *skb, *tail;
- struct page_frag *pfrag;
- struct page *page;
- u64 *write_seq;
- size_t psize;
-
- /* use the mptcp page cache so that we can easily move the data
- * from one substream to another, but do per subflow memory accounting
- * Note: pfrag is used only !retransmission, but the compiler if
- * fooled into a warning if we don't init here
- */
- pfrag = sk_page_frag(sk);
- if (!retransmission) {
- write_seq = &msk->write_seq;
- page = pfrag->page;
+ struct sk_buff *skb;
+ int space_needed;
+
+ if (unlikely(tcp_under_memory_pressure(sk))) {
+ mptcp_mem_reclaim_partial(sk);
+
+ /* under pressure pre-allocate at most a single skb */
+ if (msk->skb_tx_cache.qlen)
+ return true;
+ space_needed = msk->size_goal_cache;
} else {
- write_seq = &dfrag->data_seq;
- page = dfrag->page;
+ space_needed = msk->tx_pending_data + size -
+ msk->skb_tx_cache.qlen * msk->size_goal_cache;
}
- /* compute copy limit */
- mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
- *pmss_now = mss_now;
- *ps_goal = size_goal;
- avail_size = size_goal;
- skb = tcp_write_queue_tail(ssk);
+ while (space_needed > 0) {
+ skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ /* under memory pressure, try to pass the caller a
+ * single skb to allow forward progress
+ */
+ while (skbs->qlen > 1) {
+ skb = __skb_dequeue_tail(skbs);
+ __kfree_skb(skb);
+ }
+ return skbs->qlen > 0;
+ }
+
+ *total_ts += skb->truesize;
+ __skb_queue_tail(skbs, skb);
+ space_needed -= msk->size_goal_cache;
+ }
+ return true;
+}
+
+static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (ssk->sk_tx_skb_cache) {
+ skb = ssk->sk_tx_skb_cache;
+ if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
+ !__mptcp_add_ext(skb, gfp)))
+ return false;
+ return true;
+ }
+
+ skb = skb_peek(&msk->skb_tx_cache);
if (skb) {
- mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
+ skb = __skb_dequeue(&msk->skb_tx_cache);
+ if (WARN_ON_ONCE(!skb))
+ return false;
+
+ mptcp_wmem_uncharge(sk, skb->truesize);
+ ssk->sk_tx_skb_cache = skb;
+ return true;
+ }
+
+ /* over memory limit, no point to try to allocate a new skb */
+ return false;
+ }
+ skb = __mptcp_do_alloc_tx_skb(sk, gfp);
+ if (!skb)
+ return false;
+
+ if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
+ ssk->sk_tx_skb_cache = skb;
+ return true;
+ }
+ kfree_skb(skb);
+ return false;
+}
+
+static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk)
+{
+ return !ssk->sk_tx_skb_cache &&
+ !skb_peek(&mptcp_sk(sk)->skb_tx_cache) &&
+ tcp_under_memory_pressure(sk);
+}
+
+static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk)
+{
+ if (unlikely(mptcp_must_reclaim_memory(sk, ssk)))
+ mptcp_mem_reclaim_partial(sk);
+ return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation);
+}
+
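/*
 * Sketch of the allocation order implemented above (helper names in the
 * sketch are illustrative stand-ins, not real functions):
 *
 *	if (ssk->sk_tx_skb_cache)		// 1) reuse the subflow cache,
 *		return ensure_mptcp_ext();	//    just add the MPTCP ext if missing
 *	if (skb_peek(&msk->skb_tx_cache))	// 2) move an skb pre-allocated by the
 *		return charge_to_ssk();		//    sendmsg path onto the subflow
 *	return alloc_fresh_skb();		// 3) last resort: fresh allocation,
 *						//    may trigger memory pressure handling
 */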
+static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
+ struct mptcp_data_frag *dfrag,
+ struct mptcp_sendmsg_info *info)
+{
+ u64 data_seq = dfrag->data_seq + info->sent;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool zero_window_probe = false;
+ struct mptcp_ext *mpext = NULL;
+ struct sk_buff *skb, *tail;
+ bool can_collapse = false;
+ int size_bias = 0;
+ int avail_size;
+ size_t ret = 0;
+
+ pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
+ msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
+
+ /* compute send limit */
+ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
+ avail_size = info->size_goal;
+ msk->size_goal_cache = info->size_goal;
+ skb = tcp_write_queue_tail(ssk);
+ if (skb) {
/* Limit the write to the size available in the
* current skb, if any, so that we create at most a new skb.
* Explicitly tells TCP internals to avoid collapsing on later
* queue management operation, to avoid breaking the ext <->
* SSN association set here
*/
- can_collapse = (size_goal - skb->len > 0) &&
- mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
- if (!can_collapse)
+ mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ can_collapse = (info->size_goal - skb->len > 0) &&
+ mptcp_skb_can_collapse_to(data_seq, skb, mpext);
+ if (!can_collapse) {
TCP_SKB_CB(skb)->eor = 1;
- else
- avail_size = size_goal - skb->len;
- }
-
- if (!retransmission) {
- /* reuse tail pfrag, if possible, or carve a new one from the
- * page allocator
- */
- dfrag = mptcp_rtx_tail(sk);
- offset = pfrag->offset;
- dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
- if (!dfrag_collapsed) {
- dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
- offset = dfrag->offset;
- frag_truesize = dfrag->overhead;
- }
- psize = min_t(size_t, pfrag->size - offset, avail_size);
-
- /* Copy to page */
- pr_debug("left=%zu", msg_data_left(msg));
- psize = copy_page_from_iter(pfrag->page, offset,
- min_t(size_t, msg_data_left(msg),
- psize),
- &msg->msg_iter);
- pr_debug("left=%zu", msg_data_left(msg));
- if (!psize)
- return -EINVAL;
-
- if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) {
- iov_iter_revert(&msg->msg_iter, psize);
- return -ENOMEM;
+ } else {
+ size_bias = skb->len;
+ avail_size = info->size_goal - skb->len;
}
- } else {
- offset = dfrag->offset;
- psize = min_t(size_t, dfrag->data_len, avail_size);
}
- /* tell the TCP stack to delay the push so that we can safely
- * access the skb after the sendpages call
- */
- ret = do_tcp_sendpages(ssk, page, offset, psize,
- msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
- if (ret <= 0) {
- if (!retransmission)
- iov_iter_revert(&msg->msg_iter, psize);
- return ret;
- }
+ /* Zero window and all data acked? Probe. */
+ avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
+ if (avail_size == 0) {
+ u64 snd_una = READ_ONCE(msk->snd_una);
- frag_truesize += ret;
- if (!retransmission) {
- if (unlikely(ret < psize))
- iov_iter_revert(&msg->msg_iter, psize - ret);
+ if (skb || snd_una != msk->snd_nxt)
+ return 0;
+ zero_window_probe = true;
+ data_seq = snd_una - 1;
+ avail_size = 1;
+ }
- /* send successful, keep track of sent data for mptcp-level
- * retransmission
- */
- dfrag->data_len += ret;
- if (!dfrag_collapsed) {
- get_page(dfrag->page);
- list_add_tail(&dfrag->list, &msk->rtx_queue);
- sk_wmem_queued_add(sk, frag_truesize);
- } else {
- sk_wmem_queued_add(sk, ret);
- }
+ if (WARN_ON_ONCE(info->sent > info->limit ||
+ info->limit > dfrag->data_len))
+ return 0;
- /* charge data on mptcp rtx queue to the master socket
- * Note: we charge such data both to sk and ssk
- */
- sk->sk_forward_alloc -= frag_truesize;
+ ret = info->limit - info->sent;
+ tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
+ dfrag->page, dfrag->offset + info->sent, &ret);
+ if (!tail) {
+ tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
+ return -ENOMEM;
}
- /* if the tail skb extension is still the cached one, collapsing
- * really happened. Note: we can't check for 'same skb' as the sk_buff
- * hdr on tail can be transmitted, freed and re-allocated by the
- * do_tcp_sendpages() call
+ /* if the tail skb is still the cached one, collapsing really happened.
*/
- tail = tcp_write_queue_tail(ssk);
- if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
- WARN_ON_ONCE(!can_collapse);
+ if (skb == tail) {
+ TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
mpext->data_len += ret;
+ WARN_ON_ONCE(!can_collapse);
+ WARN_ON_ONCE(zero_window_probe);
goto out;
}
- skb = tcp_write_queue_tail(ssk);
- mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
- msk->cached_ext = NULL;
+ mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
+ if (WARN_ON_ONCE(!mpext)) {
+ /* should never reach here, stream corrupted */
+ return -EINVAL;
+ }
memset(mpext, 0, sizeof(*mpext));
- mpext->data_seq = *write_seq;
+ mpext->data_seq = data_seq;
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
mpext->data_len = ret;
mpext->use_map = 1;
@@ -794,85 +1340,257 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext->data_seq, mpext->subflow_seq, mpext->data_len,
mpext->dsn64);
+ if (zero_window_probe) {
+ mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+ mpext->frozen = 1;
+ ret = 0;
+ tcp_push_pending_frames(ssk);
+ }
out:
- if (!retransmission)
- pfrag->offset += frag_truesize;
- WRITE_ONCE(*write_seq, *write_seq + ret);
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
-
return ret;
}
-static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
-{
- clear_bit(MPTCP_SEND_SPACE, &msk->flags);
- smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
+#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
+ sizeof(struct tcphdr) - \
+ MAX_TCP_OPTION_SPACE - \
+ sizeof(struct ipv6hdr) - \
+ sizeof(struct frag_hdr))
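/*
 * With the usual header sizes (struct tcphdr 20 bytes, MAX_TCP_OPTION_SPACE
 * 40, struct ipv6hdr 40, struct frag_hdr 8) the cap above works out to:
 *
 *	MPTCP_SEND_BURST_SIZE = 65536 - 20 - 40 - 40 - 8 = 65428 bytes
 *
 * i.e. roughly a 64 KB burst minus the worst-case TCP/IPv6 header overhead.
 */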
- /* enables sk->write_space() callbacks */
- set_bit(SOCK_NOSPACE, &sock->flags);
-}
+struct subflow_send_info {
+ struct sock *ssk;
+ u64 ratio;
+};
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
+ u32 *sndbuf)
{
+ struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
- struct sock *backup = NULL;
+ int i, nr_active = 0;
+ struct sock *ssk;
+ u64 ratio;
+ u32 pace;
- sock_owned_by_me((const struct sock *)msk);
+ sock_owned_by_me((struct sock *)msk);
- if (!mptcp_ext_cache_refill(msk))
- return NULL;
+ *sndbuf = 0;
+ if (__mptcp_check_fallback(msk)) {
+ if (!msk->first)
+ return NULL;
+ *sndbuf = msk->first->sk_sndbuf;
+ return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+ }
+
+ /* re-use the last subflow, if the burst allows that */
+ if (msk->last_snd && msk->snd_burst > 0 &&
+ sk_stream_memory_free(msk->last_snd) &&
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+ }
+ return msk->last_snd;
+ }
+ /* pick the subflow with the lowest queued-data / pacing-rate ratio */
+ for (i = 0; i < 2; ++i) {
+ send_info[i].ssk = NULL;
+ send_info[i].ratio = -1;
+ }
mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
- if (!sk_stream_memory_free(ssk)) {
- struct socket *sock = ssk->sk_socket;
+ nr_active += !subflow->backup;
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+ if (!sk_stream_memory_free(subflow->tcp_sock))
+ continue;
- if (sock)
- mptcp_nospace(msk, sock);
+ pace = READ_ONCE(ssk->sk_pacing_rate);
+ if (!pace)
+ continue;
- return NULL;
+ ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
+ pace);
+ if (ratio < send_info[subflow->backup].ratio) {
+ send_info[subflow->backup].ssk = ssk;
+ send_info[subflow->backup].ratio = ratio;
}
+ }
- if (subflow->backup) {
- if (!backup)
- backup = ssk;
+ pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+ msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+ send_info[1].ssk, send_info[1].ratio);
- continue;
- }
+ /* pick the best backup if no other subflow is active */
+ if (!nr_active)
+ send_info[0].ssk = send_info[1].ssk;
- return ssk;
+ if (send_info[0].ssk) {
+ msk->last_snd = send_info[0].ssk;
+ msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
+ sk_stream_wspace(msk->last_snd));
+ return msk->last_snd;
}
+ return NULL;
+}
- return backup;
+static void mptcp_push_release(struct sock *sk, struct sock *ssk,
+ struct mptcp_sendmsg_info *info)
+{
+ mptcp_set_timeout(sk, ssk);
+ tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
+ release_sock(ssk);
}
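/*
 * The scheduler above prefers the subflow with the smallest
 * wmem_queued / pacing_rate ratio, i.e. the one expected to drain its
 * queue soonest. With illustrative numbers (not from a real trace):
 *
 *	ssk A: 64 KB queued, pacing rate 10 MB/s  ->  ~6.6 ms worth of data
 *	ssk B: 32 KB queued, pacing rate  2 MB/s  ->  ~16.4 ms worth of data
 *
 * so A is picked even though it has more bytes queued; the << 32 only
 * preserves precision for the integer division.
 */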
-static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
+static void mptcp_push_pending(struct sock *sk, unsigned int flags)
{
- struct socket *sock;
+ struct sock *prev_ssk = NULL, *ssk = NULL;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_sendmsg_info info = {
+ .flags = flags,
+ };
+ struct mptcp_data_frag *dfrag;
+ int len, copied = 0;
+ u32 sndbuf;
+
+ while ((dfrag = mptcp_send_head(sk))) {
+ info.sent = dfrag->already_sent;
+ info.limit = dfrag->data_len;
+ len = dfrag->data_len - dfrag->already_sent;
+ while (len > 0) {
+ int ret = 0;
+
+ prev_ssk = ssk;
+ __mptcp_flush_join_list(msk);
+ ssk = mptcp_subflow_get_send(msk, &sndbuf);
+
+ /* do auto tuning */
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+ sndbuf > READ_ONCE(sk->sk_sndbuf))
+ WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+
+ /* try to keep the subflow socket lock across
+ * consecutive xmit on the same socket
+ */
+ if (ssk != prev_ssk && prev_ssk)
+ mptcp_push_release(sk, prev_ssk, &info);
+ if (!ssk)
+ goto out;
- if (likely(sk_stream_is_writeable(ssk)))
- return;
+ if (ssk != prev_ssk || !prev_ssk)
+ lock_sock(ssk);
+
+ /* keep it simple and always provide a new skb for the
+ * subflow, even if we will not use it when collapsing
+ * on the pending one
+ */
+ if (!mptcp_alloc_tx_skb(sk, ssk)) {
+ mptcp_push_release(sk, ssk, &info);
+ goto out;
+ }
+
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ if (ret <= 0) {
+ mptcp_push_release(sk, ssk, &info);
+ goto out;
+ }
- sock = READ_ONCE(ssk->sk_socket);
- if (sock)
- mptcp_nospace(msk, sock);
+ info.sent += ret;
+ dfrag->already_sent += ret;
+ msk->snd_nxt += ret;
+ msk->snd_burst -= ret;
+ msk->tx_pending_data -= ret;
+ copied += ret;
+ len -= ret;
+ }
+ WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ }
+
+ /* at this point we held the socket lock for the last subflow we used */
+ if (ssk)
+ mptcp_push_release(sk, ssk, &info);
+
+out:
+ if (copied) {
+ /* start the timer, if it's not pending */
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+ __mptcp_check_send_data_fin(sk);
+ }
+}
+
+static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_sendmsg_info info;
+ struct mptcp_data_frag *dfrag;
+ int len, copied = 0;
+
+ info.flags = 0;
+ while ((dfrag = mptcp_send_head(sk))) {
+ info.sent = dfrag->already_sent;
+ info.limit = dfrag->data_len;
+ len = dfrag->data_len - dfrag->already_sent;
+ while (len > 0) {
+ int ret = 0;
+
+ /* do auto tuning */
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+ ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
+ WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
+
+ if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
+ __mptcp_update_wmem(sk);
+ sk_mem_reclaim_partial(sk);
+ }
+ if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC))
+ goto out;
+
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ if (ret <= 0)
+ goto out;
+
+ info.sent += ret;
+ dfrag->already_sent += ret;
+ msk->snd_nxt += ret;
+ msk->snd_burst -= ret;
+ msk->tx_pending_data -= ret;
+ copied += ret;
+ len -= ret;
+ }
+ WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ }
+
+out:
+ /* __mptcp_alloc_tx_skb could have released some wmem and we are
+ * not going to flush it via release_sock()
+ */
+ __mptcp_update_wmem(sk);
+ if (copied) {
+ mptcp_set_timeout(sk, ssk);
+ tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+ info.size_goal);
+ if (msk->snd_data_fin_enable &&
+ msk->snd_nxt + 1 == msk->write_seq)
+ mptcp_schedule_work(sk);
+ }
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
- int mss_now = 0, size_goal = 0, ret = 0;
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
size_t copied = 0;
- struct sock *ssk;
- bool tx_ok;
+ int ret = 0;
long timeo;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
return -EOPNOTSUPP;
- lock_sock(sk);
+ mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -883,120 +1601,95 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
pfrag = sk_page_frag(sk);
-restart:
- mptcp_clean_una(sk);
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
- ret = -EPIPE;
- goto out;
- }
+ while (msg_data_left(msg)) {
+ int total_ts, frag_truesize = 0;
+ struct mptcp_data_frag *dfrag;
+ struct sk_buff_head skbs;
+ bool dfrag_collapsed;
+ size_t psize, offset;
- __mptcp_flush_join_list(msk);
- ssk = mptcp_subflow_get_send(msk);
- while (!sk_stream_memory_free(sk) ||
- !ssk ||
- !mptcp_page_frag_refill(ssk, pfrag)) {
- if (ssk) {
- /* make sure retransmit timer is
- * running before we wait for memory.
- *
- * The retransmit timer might be needed
- * to make the peer send an up-to-date
- * MPTCP Ack.
- */
- mptcp_set_timeout(sk, ssk);
- if (!mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
+ ret = -EPIPE;
+ goto out;
}
- ret = sk_stream_wait_memory(sk, &timeo);
- if (ret)
- goto out;
+ /* reuse tail pfrag, if possible, or carve a new one from the
+ * page allocator
+ */
+ dfrag = mptcp_pending_tail(sk);
+ dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
+ if (!dfrag_collapsed) {
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_memory;
- mptcp_clean_una(sk);
+ if (!mptcp_page_frag_refill(sk, pfrag))
+ goto wait_for_memory;
- ssk = mptcp_subflow_get_send(msk);
- if (list_empty(&msk->conn_list)) {
- ret = -ENOTCONN;
- goto out;
+ dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
+ frag_truesize = dfrag->overhead;
}
- }
-
- pr_debug("conn_list->subflow=%p", ssk);
- lock_sock(ssk);
- tx_ok = msg_data_left(msg);
- while (tx_ok) {
- ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
- &size_goal);
- if (ret < 0) {
- if (ret == -EAGAIN && timeo > 0) {
- mptcp_set_timeout(sk, ssk);
- release_sock(ssk);
- goto restart;
- }
- break;
+ /* we do not bound vs wspace, to allow a single packet.
+ * memory accounting will prevent excessive memory usage
+ * anyway
+ */
+ offset = dfrag->offset + dfrag->data_len;
+ psize = pfrag->size - offset;
+ psize = min_t(size_t, psize, msg_data_left(msg));
+ total_ts = psize + frag_truesize;
+ __skb_queue_head_init(&skbs);
+ if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts))
+ goto wait_for_memory;
+
+ if (!mptcp_wmem_alloc(sk, total_ts)) {
+ __skb_queue_purge(&skbs);
+ goto wait_for_memory;
}
- copied += ret;
-
- tx_ok = msg_data_left(msg);
- if (!tx_ok)
- break;
-
- if (!sk_stream_memory_free(ssk) ||
- !mptcp_page_frag_refill(ssk, pfrag) ||
- !mptcp_ext_cache_refill(msk)) {
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- tcp_push(ssk, msg->msg_flags, mss_now,
- tcp_sk(ssk)->nonagle, size_goal);
- mptcp_set_timeout(sk, ssk);
- release_sock(ssk);
- goto restart;
+ skb_queue_splice_tail(&skbs, &msk->skb_tx_cache);
+ if (copy_page_from_iter(dfrag->page, offset, psize,
+ &msg->msg_iter) != psize) {
+ mptcp_wmem_uncharge(sk, psize + frag_truesize);
+ ret = -EFAULT;
+ goto out;
}
- /* memory is charged to mptcp level socket as well, i.e.
- * if msg is very large, mptcp socket may run out of buffer
- * space. mptcp_clean_una() will release data that has
- * been acked at mptcp level in the mean time, so there is
- * a good chance we can continue sending data right away.
- *
- * Normally, when the tcp subflow can accept more data, then
- * so can the MPTCP socket. However, we need to cope with
- * peers that might lag behind in their MPTCP-level
- * acknowledgements, i.e. data might have been acked at
- * tcp level only. So, we must also check the MPTCP socket
- * limits before we send more data.
+ /* data successfully copied into the write queue */
+ copied += psize;
+ dfrag->data_len += psize;
+ frag_truesize += psize;
+ pfrag->offset += frag_truesize;
+ WRITE_ONCE(msk->write_seq, msk->write_seq + psize);
+ msk->tx_pending_data += psize;
+
+ /* charge data on mptcp pending queue to the msk socket
+ * Note: we charge such data both to sk and ssk
*/
- if (unlikely(!sk_stream_memory_free(sk))) {
- tcp_push(ssk, msg->msg_flags, mss_now,
- tcp_sk(ssk)->nonagle, size_goal);
- mptcp_clean_una(sk);
- if (!sk_stream_memory_free(sk)) {
- /* can't send more for now, need to wait for
- * MPTCP-level ACKs from peer.
- *
- * Wakeup will happen via mptcp_clean_una().
- */
- mptcp_set_timeout(sk, ssk);
- release_sock(ssk);
- goto restart;
- }
+ sk_wmem_queued_add(sk, frag_truesize);
+ if (!dfrag_collapsed) {
+ get_page(dfrag->page);
+ list_add_tail(&dfrag->list, &msk->rtx_queue);
+ if (!msk->first_pending)
+ WRITE_ONCE(msk->first_pending, dfrag);
}
- }
+ pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk,
+ dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
+ !dfrag_collapsed);
- mptcp_set_timeout(sk, ssk);
- if (copied) {
- tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
- size_goal);
+ continue;
- /* start the timer, if it's not pending */
- if (!mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
+wait_for_memory:
+ set_bit(MPTCP_NOSPACE, &msk->flags);
+ mptcp_push_pending(sk, msg->msg_flags);
+ ret = sk_stream_wait_memory(sk, &timeo);
+ if (ret)
+ goto out;
}
- ssk_check_wmem(msk, ssk);
- release_sock(ssk);
+ if (copied)
+ mptcp_push_pending(sk, msg->msg_flags);
+
out:
release_sock(sk);
return copied ? : ret;
@@ -1021,11 +1714,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
struct msghdr *msg,
size_t len)
{
- struct sock *sk = (struct sock *)msk;
struct sk_buff *skb;
int copied = 0;
- while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ while ((skb = skb_peek(&msk->receive_queue)) != NULL) {
u32 offset = MPTCP_SKB_CB(skb)->offset;
u32 data_len = skb->len - offset;
u32 count = min_t(size_t, len - copied, data_len);
@@ -1045,7 +1737,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
break;
}
- __skb_unlink(skb, &sk->sk_receive_queue);
+ /* we will bulk release the skb memory later */
+ skb->destructor = NULL;
+ msk->rmem_released += skb->truesize;
+ __skb_unlink(skb, &msk->receive_queue);
__kfree_skb(skb);
if (copied >= len)
@@ -1135,10 +1830,14 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
*/
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk;
+ bool slow;
ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
tcp_sk(ssk)->window_clamp = window_clamp;
+ tcp_cleanup_rbuf(ssk, 1);
+ unlock_sock_fast(ssk, slow);
}
}
}
@@ -1149,23 +1848,68 @@ new_measure:
msk->rcvq_space.time = mstamp;
}
-static bool __mptcp_move_skbs(struct mptcp_sock *msk)
+static void __mptcp_update_rmem(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!msk->rmem_released)
+ return;
+
+ atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, msk->rmem_released);
+ msk->rmem_released = 0;
+}
+
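/*
 * Receive-side memory is now released in bulk: __mptcp_recvmsg_mskq() (see
 * above) clears skb->destructor and accumulates the truesize in
 * msk->rmem_released, and __mptcp_update_rmem() later subtracts the whole
 * batch under the msk data lock. Sketch of the resulting pattern:
 *
 *	// consume path, msk socket lock held
 *	skb->destructor = NULL;
 *	msk->rmem_released += skb->truesize;
 *	__skb_unlink(skb, &msk->receive_queue);
 *	__kfree_skb(skb);
 *	...
 *	// later, with mptcp_data_lock() held
 *	__mptcp_update_rmem(sk);
 */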
+static void __mptcp_splice_receive_queue(struct sock *sk)
{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
+}
+
+static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
+{
+ struct sock *sk = (struct sock *)msk;
unsigned int moved = 0;
- bool done;
+ bool ret, done;
+ __mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
+ bool slowpath;
- if (!ssk)
+ /* we can have data pending in the subflows only if the msk
+ * receive buffer was full at subflow_data_ready() time,
+ * that is an unlikely slow path.
+ */
+ if (likely(!ssk))
break;
- lock_sock(ssk);
+ slowpath = lock_sock_fast(ssk);
+ mptcp_data_lock(sk);
done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
- release_sock(ssk);
+ mptcp_data_unlock(sk);
+ if (moved && rcv) {
+ WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
+ tcp_cleanup_rbuf(ssk, 1);
+ WRITE_ONCE(msk->rmem_pending, 0);
+ }
+ unlock_sock_fast(ssk, slowpath);
} while (!done);
- return moved > 0;
+ /* acquire the data lock only if some input data is pending */
+ ret = moved > 0;
+ if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
+ !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
+ mptcp_data_lock(sk);
+ __mptcp_update_rmem(sk);
+ ret |= __mptcp_ofo_queue(msk);
+ __mptcp_splice_receive_queue(sk);
+ mptcp_data_unlock(sk);
+ }
+ if (ret)
+ mptcp_check_data_fin((struct sock *)msk);
+ return !skb_queue_empty(&msk->receive_queue);
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
@@ -1179,15 +1923,19 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
return -EOPNOTSUPP;
- lock_sock(sk);
+ mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
+ if (unlikely(sk->sk_state == TCP_LISTEN)) {
+ copied = -ENOTCONN;
+ goto out_err;
+ }
+
timeo = sock_rcvtimeo(sk, nonblock);
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
- __mptcp_flush_join_list(msk);
- while (len > (size_t)copied) {
- int bytes_read;
+ while (copied < len) {
+ int bytes_read, old_space;
bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
if (unlikely(bytes_read < 0)) {
@@ -1198,10 +1946,15 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
copied += bytes_read;
- if (skb_queue_empty(&sk->sk_receive_queue) &&
- __mptcp_move_skbs(msk))
+ if (skb_queue_empty(&msk->receive_queue) &&
+ __mptcp_move_skbs(msk, len - copied))
continue;
+ /* be sure to advertise window change */
+ old_space = READ_ONCE(msk->old_wspace);
+ if ((tcp_space(sk) - old_space) >= old_space)
+ mptcp_cleanup_rbuf(msk);
+
/* only the master socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg()
*/
@@ -1224,8 +1977,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
- if (sk->sk_shutdown & RCV_SHUTDOWN)
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ /* race breaker: the shutdown could be after the
+ * previous receive queue check
+ */
+ if (__mptcp_move_skbs(msk, len - copied))
+ continue;
break;
+ }
if (sk->sk_state == TCP_CLOSE) {
copied = -ENOTCONN;
@@ -1247,20 +2006,24 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
mptcp_wait_data(sk, &timeo);
}
- if (skb_queue_empty(&sk->sk_receive_queue)) {
+ if (skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+ skb_queue_empty(&msk->receive_queue)) {
/* entire backlog drained, clear DATA_READY. */
clear_bit(MPTCP_DATA_READY, &msk->flags);
/* .. race-breaker: ssk might have gotten new data
* after last __mptcp_move_skbs() returned false.
*/
- if (unlikely(__mptcp_move_skbs(msk)))
+ if (unlikely(__mptcp_move_skbs(msk, 0)))
set_bit(MPTCP_DATA_READY, &msk->flags);
} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
/* data to read but mptcp_wait_data() cleared DATA_READY */
set_bit(MPTCP_DATA_READY, &msk->flags);
}
out_err:
+ pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
+ msk, test_bit(MPTCP_DATA_READY, &msk->flags),
+ skb_queue_empty_lockless(&sk->sk_receive_queue), copied);
mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
@@ -1271,13 +2034,8 @@ static void mptcp_retransmit_handler(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) {
- mptcp_stop_timer(sk);
- } else {
- set_bit(MPTCP_WORK_RTX, &msk->flags);
- if (schedule_work(&msk->work))
- sock_hold(sk);
- }
+ set_bit(MPTCP_WORK_RTX, &msk->flags);
+ mptcp_schedule_work(sk);
}
static void mptcp_retransmit_timer(struct timer_list *t)
@@ -1299,6 +2057,14 @@ static void mptcp_retransmit_timer(struct timer_list *t)
sock_put(sk);
}
+static void mptcp_timeout_timer(struct timer_list *t)
+{
+ struct sock *sk = from_timer(sk, t, sk_timer);
+
+ mptcp_schedule_work(sk);
+ sock_put(sk);
+}
+
/* Find an idle subflow. Return NULL if there is unacked data at tcp
* level.
*
@@ -1311,12 +2077,21 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
sock_owned_by_me((const struct sock *)msk);
+ if (__mptcp_check_fallback(msk))
+ return NULL;
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
+
/* still data outstanding at TCP level? Don't retransmit. */
- if (!tcp_write_queue_empty(ssk))
+ if (!tcp_write_queue_empty(ssk)) {
+ if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss)
+ continue;
return NULL;
+ }
if (subflow->backup) {
if (!backup)
@@ -1338,21 +2113,45 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
* so we need to use tcp_close() after detaching them from the mptcp
* parent socket.
*/
-static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
- struct mptcp_subflow_context *subflow,
- long timeout)
+void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow)
{
- struct socket *sock = READ_ONCE(ssk->sk_socket);
+ bool dispose_socket = false;
+ struct socket *sock;
list_del(&subflow->node);
- if (sock && sock != sk->sk_socket) {
- /* outgoing subflow */
- sock_release(sock);
+ lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+
+ /* if we are invoked by the msk cleanup code, the subflow is
+ * already orphaned
+ */
+ sock = ssk->sk_socket;
+ if (sock) {
+ dispose_socket = sock != sk->sk_socket;
+ sock_orphan(ssk);
+ }
+
+ subflow->disposable = 1;
+
+ /* if the ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops:
+ * the ssk has already been destroyed, we just need to release the
+ * reference owned by the msk
+ */
+ if (!inet_csk(ssk)->icsk_ulp_ops) {
+ kfree_rcu(subflow, rcu);
} else {
- /* incoming subflow */
- tcp_close(ssk, timeout);
+ /* otherwise tcp will dispose of the ssk and subflow ctx */
+ __tcp_close(ssk, 0);
+
+ /* close acquired an extra ref */
+ __sock_put(ssk);
}
+ release_sock(ssk);
+ if (dispose_socket)
+ iput(SOCK_INODE(sock));
+
+ sock_put(ssk);
}
static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
@@ -1371,6 +2170,14 @@ static void pm_work(struct mptcp_sock *msk)
pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
mptcp_pm_nl_add_addr_received(msk);
}
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
+ mptcp_pm_nl_add_addr_send_ack(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
+ mptcp_pm_nl_rm_addr_received(msk);
+ }
if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
mptcp_pm_nl_fully_established(msk);
@@ -1383,24 +2190,90 @@ static void pm_work(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
}
+static void __mptcp_close_subflow(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+
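+	/* dispose of the subflows that already reached the TCP_CLOSE state */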
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (inet_sk_state_load(ssk) != TCP_CLOSE)
+ continue;
+
+ __mptcp_close_ssk((struct sock *)msk, ssk, subflow);
+ }
+}
+
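+/* mptcp_close() stores the close timestamp in icsk_mtup.probe_timestamp */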
+static bool mptcp_check_close_timeout(const struct sock *sk)
+{
+ s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
+ struct mptcp_subflow_context *subflow;
+
+ if (delta >= TCP_TIMEWAIT_LEN)
+ return true;
+
+	/* if all subflows are in closed status, don't bother with additional
+	 * timeout
+	 */
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+ if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
+ TCP_CLOSE)
+ return false;
+ }
+ return true;
+}
+
+static void mptcp_check_fastclose(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = &msk->sk.icsk_inet.sk;
+
+ if (likely(!READ_ONCE(msk->rcv_fastclose)))
+ return;
+
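+	/* the peer sent an MP_FASTCLOSE: reset every subflow and close the msk */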
+ mptcp_token_destroy(msk);
+
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(tcp_sk);
+ if (tcp_sk->sk_state != TCP_CLOSE) {
+ tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
+ tcp_set_state(tcp_sk, TCP_CLOSE);
+ }
+ release_sock(tcp_sk);
+ }
+
+ inet_sk_state_store(sk, TCP_CLOSE);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
+
+ mptcp_close_wake_up(sk);
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
- int orig_len, orig_offset, mss_now = 0, size_goal = 0;
+ struct mptcp_sendmsg_info info = {};
struct mptcp_data_frag *dfrag;
- u64 orig_write_seq;
size_t copied = 0;
- struct msghdr msg = {
- .msg_flags = MSG_DONTWAIT,
- };
- long timeo = 0;
+ int state, ret;
lock_sock(sk);
- mptcp_clean_una(sk);
+ state = sk->sk_state;
+ if (unlikely(state == TCP_CLOSE))
+ goto unlock;
+
mptcp_check_data_fin_ack(sk);
__mptcp_flush_join_list(msk);
- __mptcp_move_skbs(msk);
+
+ mptcp_check_fastclose(msk);
+
+ if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+ __mptcp_close_subflow(msk);
if (msk->pm.status)
pm_work(msk);
@@ -1408,8 +2281,21 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
+ __mptcp_check_send_data_fin(sk);
mptcp_check_data_fin(sk);
+	/* if the msk data is completely acked, or the socket timed out,
+ * there is no point in keeping around an orphaned sk
+ */
+ if (sock_flag(sk, SOCK_DEAD) &&
+ (mptcp_check_close_timeout(sk) ||
+ (state != sk->sk_state &&
+ ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) {
+ inet_sk_state_store(sk, TCP_CLOSE);
+ __mptcp_destroy_sock(sk);
+ goto unlock;
+ }
+
if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
goto unlock;
@@ -1417,39 +2303,30 @@ static void mptcp_worker(struct work_struct *work)
if (!dfrag)
goto unlock;
- if (!mptcp_ext_cache_refill(msk))
- goto reset_unlock;
-
ssk = mptcp_subflow_get_retrans(msk);
if (!ssk)
goto reset_unlock;
lock_sock(ssk);
- orig_len = dfrag->data_len;
- orig_offset = dfrag->offset;
- orig_write_seq = dfrag->data_seq;
- while (dfrag->data_len > 0) {
- int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
- &mss_now, &size_goal);
- if (ret < 0)
+ /* limit retransmission to the bytes already sent on some subflows */
+ info.sent = 0;
+ info.limit = dfrag->already_sent;
+ while (info.sent < dfrag->already_sent) {
+ if (!mptcp_alloc_tx_skb(sk, ssk))
+ break;
+
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ if (ret <= 0)
break;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
copied += ret;
- dfrag->data_len -= ret;
- dfrag->offset += ret;
-
- if (!mptcp_ext_cache_refill(msk))
- break;
+ info.sent += ret;
}
if (copied)
- tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
- size_goal);
-
- dfrag->data_seq = orig_write_seq;
- dfrag->offset = orig_offset;
- dfrag->data_len = orig_len;
+ tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+ info.size_goal);
mptcp_set_timeout(sk, ssk);
release_sock(ssk);
@@ -1472,9 +2349,17 @@ static int __mptcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&msk->conn_list);
INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue);
- __set_bit(MPTCP_SEND_SPACE, &msk->flags);
INIT_WORK(&msk->work, mptcp_worker);
-
+ __skb_queue_head_init(&msk->receive_queue);
+ __skb_queue_head_init(&msk->skb_tx_cache);
+ msk->out_of_order_queue = RB_ROOT;
+ msk->first_pending = NULL;
+ msk->wmem_reserved = 0;
+ msk->rmem_released = 0;
+ msk->tx_pending_data = 0;
+ msk->size_goal_cache = TCP_BASE_MSS;
+
+ msk->ack_hint = NULL;
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -1482,7 +2367,7 @@ static int __mptcp_init_sock(struct sock *sk)
/* re-use the csk retrans timer for MPTCP-level retrans */
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
-
+ timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
return 0;
}
@@ -1491,23 +2376,23 @@ static int mptcp_init_sock(struct sock *sk)
struct net *net = sock_net(sk);
int ret;
+ ret = __mptcp_init_sock(sk);
+ if (ret)
+ return ret;
+
if (!mptcp_is_enabled(net))
return -ENOPROTOOPT;
if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
return -ENOMEM;
- ret = __mptcp_init_sock(sk);
- if (ret)
- return ret;
-
ret = __mptcp_socket_create(mptcp_sk(sk));
if (ret)
return ret;
sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
return 0;
}
@@ -1516,11 +2401,15 @@ static void __mptcp_clear_xmit(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
+ struct sk_buff *skb;
- sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
-
+ WRITE_ONCE(msk->first_pending, NULL);
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
dfrag_clear(sk, dfrag);
+ while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) {
+ sk->sk_forward_alloc += skb->truesize;
+ kfree_skb(skb);
+ }
}
static void mptcp_cancel_work(struct sock *sk)
@@ -1528,10 +2417,10 @@ static void mptcp_cancel_work(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk);
if (cancel_work_sync(&msk->work))
- sock_put(sk);
+ __sock_put(sk);
}
-static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
+void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
{
lock_sock(ssk);
@@ -1586,42 +2475,67 @@ static int mptcp_close_state(struct sock *sk)
return next & TCP_ACTION_FIN;
}
-static void mptcp_close(struct sock *sk, long timeout)
+static void __mptcp_check_send_data_fin(struct sock *sk)
{
- struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- LIST_HEAD(conn_list);
- lock_sock(sk);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
+ msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
+ msk->snd_nxt, msk->write_seq);
- if (sk->sk_state == TCP_LISTEN) {
- inet_sk_state_store(sk, TCP_CLOSE);
- goto cleanup;
- } else if (sk->sk_state == TCP_CLOSE) {
- goto cleanup;
- }
+	/* skip this if we still need to enqueue subflows or are not really
+	 * shutting down
+	 */
+ if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq ||
+ mptcp_send_head(sk))
+ return;
+
+ WRITE_ONCE(msk->snd_nxt, msk->write_seq);
+	/* a fallback socket will not get a data_fin/ack, so we can move to the
+	 * next state now
+	 */
if (__mptcp_check_fallback(msk)) {
- goto update_state;
- } else if (mptcp_close_state(sk)) {
- pr_debug("Sending DATA_FIN sk=%p", sk);
- WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
- WRITE_ONCE(msk->snd_data_fin_enable, 1);
+ if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_close_wake_up(sk);
+ } else if (sk->sk_state == TCP_FIN_WAIT1) {
+ inet_sk_state_store(sk, TCP_FIN_WAIT2);
+ }
+ }
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+ __mptcp_flush_join_list(msk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
- mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK);
- }
+ mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN);
}
+}
- sk_stream_wait_close(sk, timeout);
+static void __mptcp_wr_shutdown(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
-update_state:
- inet_sk_state_store(sk, TCP_CLOSE);
+ pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
+ msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
+ !!mptcp_send_head(sk));
+
+ /* will be ignored by fallback sockets */
+ WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
+ WRITE_ONCE(msk->snd_data_fin_enable, 1);
+
+ __mptcp_check_send_data_fin(sk);
+}
+
+static void __mptcp_destroy_sock(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ LIST_HEAD(conn_list);
+
+ pr_debug("msk=%p", msk);
-cleanup:
/* be sure to always acquire the join list lock, to sync vs
* mptcp_finish_join().
*/
@@ -1630,20 +2544,77 @@ cleanup:
spin_unlock_bh(&msk->join_list_lock);
list_splice_init(&msk->conn_list, &conn_list);
- __mptcp_clear_xmit(sk);
-
- release_sock(sk);
+ sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+ sk_stop_timer(sk, &sk->sk_timer);
+ msk->pm.status = 0;
list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- __mptcp_close_ssk(sk, ssk, subflow, timeout);
+ __mptcp_close_ssk(sk, ssk, subflow);
+ }
+
+ sk->sk_prot->destroy(sk);
+
+ WARN_ON_ONCE(msk->wmem_reserved);
+ WARN_ON_ONCE(msk->rmem_released);
+ sk_stream_kill_queues(sk);
+ xfrm_sk_free_policy(sk);
+ sk_refcnt_debug_release(sk);
+ sock_put(sk);
+}
+
+static void mptcp_close(struct sock *sk, long timeout)
+{
+ struct mptcp_subflow_context *subflow;
+ bool do_cancel_work = false;
+
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
+ inet_sk_state_store(sk, TCP_CLOSE);
+ goto cleanup;
}
- mptcp_cancel_work(sk);
+ if (mptcp_close_state(sk))
+ __mptcp_wr_shutdown(sk);
- __skb_queue_purge(&sk->sk_receive_queue);
+ sk_stream_wait_close(sk, timeout);
- sk_common_release(sk);
+cleanup:
+ /* orphan all the subflows */
+ inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
+ list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow, dispose_socket;
+ struct socket *sock;
+
+ slow = lock_sock_fast(ssk);
+ sock = ssk->sk_socket;
+ dispose_socket = sock && sock != sk->sk_socket;
+ sock_orphan(ssk);
+ unlock_sock_fast(ssk, slow);
+
+ /* for the outgoing subflows we additionally need to free
+ * the associated socket
+ */
+ if (dispose_socket)
+ iput(SOCK_INODE(sock));
+ }
+ sock_orphan(sk);
+
+ sock_hold(sk);
+ pr_debug("msk=%p state=%d", sk, sk->sk_state);
+ if (sk->sk_state == TCP_CLOSE) {
+ __mptcp_destroy_sock(sk);
+ do_cancel_work = true;
+ } else {
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
+ }
+ release_sock(sk);
+ if (do_cancel_work)
+ mptcp_cancel_work(sk);
+ sock_put(sk);
}
static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
@@ -1671,11 +2642,17 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
static int mptcp_disconnect(struct sock *sk, int flags)
{
- /* Should never be called.
- * inet_stream_connect() calls ->disconnect, but that
- * refers to the subflow socket, not the mptcp one.
- */
- WARN_ON_ONCE(1);
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ __mptcp_flush_join_list(msk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+ tcp_disconnect(ssk, flags);
+ release_sock(ssk);
+ }
return 0;
}
@@ -1714,18 +2691,24 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
WRITE_ONCE(msk->fully_established, false);
msk->write_seq = subflow_req->idsn + 1;
- atomic64_set(&msk->snd_una, msk->write_seq);
+ msk->snd_nxt = msk->write_seq;
+ msk->snd_una = msk->write_seq;
+ msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
+
if (mp_opt->mp_capable) {
msk->can_ack = true;
msk->remote_key = mp_opt->sndr_key;
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
ack_seq++;
WRITE_ONCE(msk->ack_seq, ack_seq);
+ WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
}
sock_reset_flag(nsk, SOCK_RCU_FREE);
/* will be fully established after successful MPC subflow creation */
inet_sk_state_store(nsk, TCP_SYN_RECV);
+
+ security_inet_csk_clone(nsk, req);
bh_unlock_sock(nsk);
/* keep a single reference */
@@ -1747,6 +2730,8 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
TCP_INIT_CWND * tp->advmss);
if (msk->rcvq_space.space == 0)
msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+
+ WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
}
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
@@ -1771,7 +2756,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
if (sk_is_mptcp(newsk)) {
struct mptcp_subflow_context *subflow;
struct sock *new_mptcp_sock;
- struct sock *ssk = newsk;
subflow = mptcp_subflow_ctx(newsk);
new_mptcp_sock = subflow->conn;
@@ -1786,21 +2770,8 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
/* acquire the 2nd reference for the owning socket */
sock_hold(new_mptcp_sock);
-
- local_bh_disable();
- bh_lock_sock(new_mptcp_sock);
- msk = mptcp_sk(new_mptcp_sock);
- msk->first = newsk;
-
newsk = new_mptcp_sock;
- mptcp_copy_inaddrs(newsk, ssk);
- list_add(&subflow->node, &msk->conn_list);
-
- mptcp_rcv_space_init(msk, ssk);
- bh_unlock_sock(new_mptcp_sock);
-
- __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
- local_bh_enable();
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
} else {
MPTCP_INC_STATS(sock_net(sk),
MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
@@ -1809,14 +2780,25 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
return newsk;
}
-static void mptcp_destroy(struct sock *sk)
+void mptcp_destroy_common(struct mptcp_sock *msk)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sock *sk = (struct sock *)msk;
+
+ __mptcp_clear_xmit(sk);
+
+ /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
+ skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
+ skb_rbtree_purge(&msk->out_of_order_queue);
mptcp_token_destroy(msk);
- if (msk->cached_ext)
- __skb_ext_put(msk->cached_ext);
+ mptcp_pm_free_anno_list(msk);
+}
+static void mptcp_destroy(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_destroy_common(msk);
sk_sockets_allocated_dec(sk);
}
@@ -1930,16 +2912,58 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP;
}
-#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
- TCPF_WRITE_TIMER_DEFERRED)
+void __mptcp_data_acked(struct sock *sk)
+{
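+	/* when the msk is owned by user context, defer the cleanup to the
+	 * release callback
+	 */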
+ if (!sock_owned_by_user(sk))
+ __mptcp_clean_una(sk);
+ else
+ set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);
-/* this is very alike tcp_release_cb() but we must handle differently a
- * different set of events
- */
+ if (mptcp_pending_data_fin_ack(sk))
+ mptcp_schedule_work(sk);
+}
+
+void __mptcp_check_push(struct sock *sk, struct sock *ssk)
+{
+ if (!mptcp_send_head(sk))
+ return;
+
+ if (!sock_owned_by_user(sk))
+ __mptcp_subflow_push_pending(sk, ssk);
+ else
+ set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+}
+
+#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
+
+/* processes deferred events and flushes wmem */
static void mptcp_release_cb(struct sock *sk)
{
unsigned long flags, nflags;
+ /* push_pending may touch wmem_reserved, do it before the later
+ * cleanup
+ */
+ if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
+ __mptcp_clean_una(sk);
+ if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) {
+ /* mptcp_push_pending() acquires the subflow socket lock
+ *
+ * 1) can't be invoked in atomic scope
+ * 2) must avoid ABBA deadlock with msk socket spinlock: the RX
+	 * datapath acquires the msk socket spinlock while holding
+ * the subflow socket lock
+ */
+
+ spin_unlock_bh(&sk->sk_lock.slock);
+ mptcp_push_pending(sk, 0);
+ spin_lock_bh(&sk->sk_lock.slock);
+ }
+
+ /* clear any wmem reservation and errors */
+ __mptcp_update_wmem(sk);
+ __mptcp_update_rmem(sk);
+
do {
flags = sk->sk_tsq_flags;
if (!(flags & MPTCP_DEFERRED_ALL))
@@ -1949,15 +2973,6 @@ static void mptcp_release_cb(struct sock *sk)
sock_release_ownership(sk);
- if (flags & TCPF_DELACK_TIMER_DEFERRED) {
- struct mptcp_sock *msk = mptcp_sk(sk);
- struct sock *ssk;
-
- ssk = mptcp_subflow_recv_lookup(msk);
- if (!ssk || !schedule_work(&msk->work))
- __sock_put(sk);
- }
-
if (flags & TCPF_WRITE_TIMER_DEFERRED) {
mptcp_retransmit_handler(sk);
__sock_put(sk);
@@ -2015,9 +3030,11 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->local_key, subflow->local_key);
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
+ WRITE_ONCE(msk->snd_nxt, msk->write_seq);
WRITE_ONCE(msk->ack_seq, ack_seq);
+ WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
- atomic64_set(&msk->snd_una, msk->write_seq);
+ WRITE_ONCE(msk->snd_una, msk->write_seq);
mptcp_pm_new_connection(msk, 0);
@@ -2033,9 +3050,9 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
write_unlock_bh(&sk->sk_callback_lock);
}
-bool mptcp_finish_join(struct sock *sk)
+bool mptcp_finish_join(struct sock *ssk)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct sock *parent = (void *)msk;
struct socket *parent_sock;
@@ -2056,12 +3073,14 @@ bool mptcp_finish_join(struct sock *sk)
/* active connections are already on conn_list, and we can't acquire
* msk lock here.
* use the join list lock as synchronization point and double-check
- * msk status to avoid racing with mptcp_close()
+ * msk status to avoid racing with __mptcp_destroy_sock()
*/
spin_lock_bh(&msk->join_list_lock);
ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
- if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
+ if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
list_add_tail(&subflow->node, &msk->join_list);
+ sock_hold(ssk);
+ }
spin_unlock_bh(&msk->join_list_lock);
if (!ret)
return false;
@@ -2070,17 +3089,18 @@ bool mptcp_finish_join(struct sock *sk)
* at close time
*/
parent_sock = READ_ONCE(parent->sk_socket);
- if (parent_sock && !sk->sk_socket)
- mptcp_sock_graft(sk, parent_sock);
+ if (parent_sock && !ssk->sk_socket)
+ mptcp_sock_graft(ssk, parent_sock);
subflow->map_seq = READ_ONCE(msk->ack_seq);
return true;
}
-static bool mptcp_memory_free(const struct sock *sk, int wake)
+static void mptcp_shutdown(struct sock *sk, int how)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
+ pr_debug("sk=%p, how=%d", sk, how);
- return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
+ if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
+ __mptcp_wr_shutdown(sk);
}
static struct proto mptcp_prot = {
@@ -2092,7 +3112,7 @@ static struct proto mptcp_prot = {
.accept = mptcp_accept,
.setsockopt = mptcp_setsockopt,
.getsockopt = mptcp_getsockopt,
- .shutdown = tcp_shutdown,
+ .shutdown = mptcp_shutdown,
.destroy = mptcp_destroy,
.sendmsg = mptcp_sendmsg,
.recvmsg = mptcp_recvmsg,
@@ -2103,8 +3123,8 @@ static struct proto mptcp_prot = {
.sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
- .stream_memory_free = mptcp_memory_free,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.sysctl_mem = sysctl_tcp_mem,
.obj_size = sizeof(struct mptcp_sock),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -2247,6 +3267,23 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
struct mptcp_sock *msk = mptcp_sk(newsock->sk);
struct mptcp_subflow_context *subflow;
+ struct sock *newsk = newsock->sk;
+ bool slowpath;
+
+ slowpath = lock_sock_fast(newsk);
+
+		/* the PM/worker can now acquire the first subflow socket
+		 * lock without racing with the listener queue cleanup;
+		 * notify it, if needed.
+		 */
+ subflow = mptcp_subflow_ctx(msk->first);
+ list_add(&subflow->node, &msk->conn_list);
+ sock_hold(msk->first);
+ if (mptcp_is_fully_established(newsk))
+ mptcp_pm_fully_established(msk);
+
+ mptcp_copy_inaddrs(newsk, msk->first);
+ mptcp_rcv_space_init(msk, msk->first);
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
@@ -2258,6 +3295,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (!ssk->sk_socket)
mptcp_sock_graft(ssk, newsock);
}
+ unlock_sock_fast(newsk, slowpath);
}
if (inet_csk_listen_poll(ssock->sk))
@@ -2276,6 +3314,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
0;
}
+static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+
+ if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
+ return 0;
+
+ if (sk_stream_is_writeable(sk))
+ return EPOLLOUT | EPOLLWRNORM;
+
+ set_bit(MPTCP_NOSPACE, &msk->flags);
+ smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
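+	/* re-check after setting NOSPACE, to avoid missing a wakeup from the
+	 * write space callback
+	 */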
+ if (sk_stream_is_writeable(sk))
+ return EPOLLOUT | EPOLLWRNORM;
+
+ return 0;
+}
+
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
@@ -2288,14 +3344,13 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
sock_poll_wait(file, sock, wait);
state = inet_sk_state_load(sk);
+ pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
if (state == TCP_LISTEN)
return mptcp_check_readable(msk);
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk);
- if (sk_stream_is_writeable(sk) &&
- test_bit(MPTCP_SEND_SPACE, &msk->flags))
- mask |= EPOLLOUT | EPOLLWRNORM;
+ mask |= mptcp_check_writeable(msk);
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
@@ -2303,66 +3358,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
return mask;
}
-static int mptcp_shutdown(struct socket *sock, int how)
-{
- struct mptcp_sock *msk = mptcp_sk(sock->sk);
- struct mptcp_subflow_context *subflow;
- int ret = 0;
-
- pr_debug("sk=%p, how=%d", msk, how);
-
- lock_sock(sock->sk);
-
- how++;
- if ((how & ~SHUTDOWN_MASK) || !how) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- if (sock->state == SS_CONNECTING) {
- if ((1 << sock->sk->sk_state) &
- (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
- sock->state = SS_DISCONNECTING;
- else
- sock->state = SS_CONNECTED;
- }
-
- /* If we've already sent a FIN, or it's a closed state, skip this. */
- if (__mptcp_check_fallback(msk)) {
- if (how == SHUT_WR || how == SHUT_RDWR)
- inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
-
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
-
- mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
- }
- } else if ((how & SEND_SHUTDOWN) &&
- ((1 << sock->sk->sk_state) &
- (TCPF_ESTABLISHED | TCPF_SYN_SENT |
- TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) &&
- mptcp_close_state(sock->sk)) {
- __mptcp_flush_join_list(msk);
-
- WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
- WRITE_ONCE(msk->snd_data_fin_enable, 1);
-
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
-
- mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
- }
- }
-
- /* Wake up anyone sleeping in poll. */
- sock->sk->sk_state_change(sock->sk);
-
-out_unlock:
- release_sock(sock->sk);
-
- return ret;
-}
-
static const struct proto_ops mptcp_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
@@ -2376,7 +3371,7 @@ static const struct proto_ops mptcp_stream_ops = {
.ioctl = inet_ioctl,
.gettstamp = sock_gettstamp,
.listen = mptcp_listen,
- .shutdown = mptcp_shutdown,
+ .shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
@@ -2426,7 +3421,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.ioctl = inet6_ioctl,
.gettstamp = sock_gettstamp,
.listen = mptcp_listen,
- .shutdown = mptcp_shutdown,
+ .shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet6_sendmsg,
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 20f04ac85409..d67de793d363 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -23,6 +23,7 @@
#define OPTION_MPTCP_ADD_ADDR BIT(6)
#define OPTION_MPTCP_ADD_ADDR6 BIT(7)
#define OPTION_MPTCP_RM_ADDR BIT(8)
+#define OPTION_MPTCP_FASTCLOSE BIT(9)
/* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE 0
@@ -49,15 +50,16 @@
#define TCPOLEN_MPTCP_DSS_MAP64 14
#define TCPOLEN_MPTCP_DSS_CHECKSUM 2
#define TCPOLEN_MPTCP_ADD_ADDR 16
-#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18
+#define TCPOLEN_MPTCP_ADD_ADDR_PORT 20
#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8
-#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10
+#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 12
#define TCPOLEN_MPTCP_ADD_ADDR6 28
-#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30
+#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 32
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20
-#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22
-#define TCPOLEN_MPTCP_PORT_LEN 2
+#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24
+#define TCPOLEN_MPTCP_PORT_LEN 4
#define TCPOLEN_MPTCP_RM_ADDR_BASE 4
+#define TCPOLEN_MPTCP_FASTCLOSE 12
/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP BIT(0)
@@ -86,10 +88,20 @@
/* MPTCP socket flags */
#define MPTCP_DATA_READY 0
-#define MPTCP_SEND_SPACE 1
+#define MPTCP_NOSPACE 1
#define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4
+#define MPTCP_WORK_CLOSE_SUBFLOW 5
+#define MPTCP_PUSH_PENDING 6
+#define MPTCP_CLEAN_UNA 7
+
+static inline bool before64(__u64 seq1, __u64 seq2)
+{
+ return (__s64)(seq1 - seq2) < 0;
+}
+
+#define after64(seq2, seq1) before64(seq1, seq2)
struct mptcp_options_received {
u64 sndr_key;
@@ -100,6 +112,7 @@ struct mptcp_options_received {
u16 data_len;
u16 mp_capable : 1,
mp_join : 1,
+ fastclose : 1,
dss : 1,
add_addr : 1,
rm_addr : 1,
@@ -109,7 +122,7 @@ struct mptcp_options_received {
u32 token;
u32 nonce;
u64 thmac;
- u8 hmac[20];
+ u8 hmac[MPTCPOPT_HMAC_LEN];
u8 join_id;
u8 use_map:1,
dsn64:1,
@@ -140,6 +153,8 @@ struct mptcp_addr_info {
sa_family_t family;
__be16 port;
u8 id;
+ u8 flags;
+ int ifindex;
union {
struct in_addr addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -150,17 +165,29 @@ struct mptcp_addr_info {
enum mptcp_pm_status {
MPTCP_PM_ADD_ADDR_RECEIVED,
+ MPTCP_PM_ADD_ADDR_SEND_ACK,
+ MPTCP_PM_RM_ADDR_RECEIVED,
MPTCP_PM_ESTABLISHED,
+ MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */
MPTCP_PM_SUBFLOW_ESTABLISHED,
};
+enum mptcp_addr_signal_status {
+ MPTCP_ADD_ADDR_SIGNAL,
+ MPTCP_ADD_ADDR_ECHO,
+ MPTCP_ADD_ADDR_IPV6,
+ MPTCP_ADD_ADDR_PORT,
+ MPTCP_RM_ADDR_SIGNAL,
+};
+
struct mptcp_pm_data {
struct mptcp_addr_info local;
struct mptcp_addr_info remote;
+ struct list_head anno_list;
spinlock_t lock; /* protects the whole PM data */
- bool addr_signal;
+ u8 addr_signal;
bool server_side;
bool work_pending;
bool accept_addr;
@@ -174,14 +201,16 @@ struct mptcp_pm_data {
u8 local_addr_max;
u8 subflows_max;
u8 status;
+ u8 rm_id;
};
struct mptcp_data_frag {
struct list_head list;
u64 data_seq;
- int data_len;
- int offset;
- int overhead;
+ u16 data_len;
+ u16 offset;
+ u16 overhead;
+ u16 already_sent;
struct page *page;
};
@@ -192,22 +221,40 @@ struct mptcp_sock {
u64 local_key;
u64 remote_key;
u64 write_seq;
+ u64 snd_nxt;
u64 ack_seq;
+ u64 rcv_wnd_sent;
u64 rcv_data_fin_seq;
- atomic64_t snd_una;
+ int wmem_reserved;
+ struct sock *last_snd;
+ int snd_burst;
+ int old_wspace;
+ u64 snd_una;
+ u64 wnd_end;
unsigned long timer_ival;
u32 token;
+ int rmem_pending;
+ int rmem_released;
unsigned long flags;
bool can_ack;
bool fully_established;
bool rcv_data_fin;
bool snd_data_fin_enable;
+ bool rcv_fastclose;
+ bool use_64bit_ack; /* Set when we received a 64-bit DSN */
spinlock_t join_list_lock;
+ struct sock *ack_hint;
struct work_struct work;
+ struct sk_buff *ooo_last_skb;
+ struct rb_root out_of_order_queue;
+ struct sk_buff_head receive_queue;
+ struct sk_buff_head skb_tx_cache; /* this is wmem accounted */
+ int tx_pending_data;
+ int size_goal_cache;
struct list_head conn_list;
struct list_head rtx_queue;
+ struct mptcp_data_frag *first_pending;
struct list_head join_list;
- struct skb_ext *cached_ext; /* for the next sendmsg */
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
struct mptcp_pm_data pm;
@@ -219,6 +266,22 @@ struct mptcp_sock {
} rcvq_space;
};
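+/* like lock_sock(), but additionally run @cb while holding the socket
+ * spin lock, just before taking ownership of the socket
+ */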
+#define mptcp_lock_sock(___sk, cb) do { \
+ struct sock *__sk = (___sk); /* silence macro reuse warning */ \
+ might_sleep(); \
+ spin_lock_bh(&__sk->sk_lock.slock); \
+ if (__sk->sk_lock.owned) \
+ __lock_sock(__sk); \
+ cb; \
+ __sk->sk_lock.owned = 1; \
+ spin_unlock(&__sk->sk_lock.slock); \
+ mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \
+ local_bh_enable(); \
+} while (0)
+
+#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
+#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
+
#define mptcp_for_each_subflow(__msk, __subflow) \
list_for_each_entry(__subflow, &((__msk)->conn_list), node)
@@ -227,11 +290,46 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
return (struct mptcp_sock *)sk;
}
+static inline int __mptcp_space(const struct sock *sk)
+{
+ return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_pending);
+}
+
+static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+
+ return READ_ONCE(msk->first_pending);
+}
+
+static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *cur;
+
+ cur = msk->first_pending;
+ return list_is_last(&cur->list, &msk->rtx_queue) ? NULL :
+ list_next_entry(cur, list);
+}
+
+static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!msk->first_pending)
+ return NULL;
+
+ if (WARN_ON_ONCE(list_empty(&msk->rtx_queue)))
+ return NULL;
+
+ return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (list_empty(&msk->rtx_queue))
+ if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una)))
return NULL;
return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
@@ -268,6 +366,12 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
return (struct mptcp_subflow_request_sock *)rsk;
}
+enum mptcp_data_avail {
+ MPTCP_SUBFLOW_NODATA,
+ MPTCP_SUBFLOW_DATA_AVAIL,
+ MPTCP_SUBFLOW_OOO_DATA
+};
+
/* MPTCP subflow context */
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
@@ -292,10 +396,10 @@ struct mptcp_subflow_context {
map_valid : 1,
mpc_map : 1,
backup : 1,
- data_avail : 1,
rx_eof : 1,
- use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
- can_ack : 1; /* only after processing the remote a key */
+		can_ack : 1,        /* only after processing the remote key */
+		disposable : 1;	    /* ctx can be freed at ulp release time */
+ enum mptcp_data_avail data_avail;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -343,15 +447,28 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
}
+static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
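+	/* hold the subflow tcp socket as long as it sits on the msk lists */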
+ sock_hold(mptcp_subflow_tcp_sock(subflow));
+ spin_lock_bh(&msk->join_list_lock);
+ list_add_tail(&subflow->node, &msk->join_list);
+ spin_unlock_bh(&msk->join_list_lock);
+}
+
int mptcp_is_enabled(struct net *net);
+unsigned int mptcp_get_add_addr_timeout(struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void);
+void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
+void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow);
+void mptcp_subflow_reset(struct sock *ssk);
/* called with sk socket lock held */
-int __mptcp_subflow_connect(struct sock *sk, int ifindex,
- const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
@@ -385,9 +502,19 @@ static inline bool mptcp_is_fully_established(struct sock *sk)
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
-void mptcp_data_acked(struct sock *sk);
+bool mptcp_schedule_work(struct sock *sk);
+void __mptcp_check_push(struct sock *sk, struct sock *ssk);
+void __mptcp_data_acked(struct sock *sk);
void mptcp_subflow_eof(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
+void __mptcp_flush_join_list(struct mptcp_sock *msk);
+static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->snd_data_fin_enable) &&
+ READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
+}
+
+void mptcp_destroy_common(struct mptcp_sock *msk);
void __init mptcp_token_init(void);
static inline void mptcp_token_init_request(struct request_sock *req)
@@ -421,26 +548,62 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk,
void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id);
void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
+void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
+void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id);
+void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
+struct mptcp_pm_add_entry *
+mptcp_pm_del_add_timer(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr);
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr);
+ const struct mptcp_addr_info *addr,
+ bool echo, bool port);
int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id);
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id);
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id);
+
+static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL);
+}
+
+static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO);
+}
-static inline bool mptcp_pm_should_signal(struct mptcp_sock *msk)
+static inline bool mptcp_pm_should_add_signal_ipv6(struct mptcp_sock *msk)
{
- return READ_ONCE(msk->pm.addr_signal);
+ return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6);
}
-static inline unsigned int mptcp_add_addr_len(int family)
+static inline bool mptcp_pm_should_add_signal_port(struct mptcp_sock *msk)
{
- if (family == AF_INET)
- return TCPOLEN_MPTCP_ADD_ADDR;
- return TCPOLEN_MPTCP_ADD_ADDR6;
+ return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_PORT);
}
-bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
- struct mptcp_addr_info *saddr);
+static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL);
+}
+
+static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port)
+{
+ u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
+
+ if (family == AF_INET6)
+ len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
+ if (!echo)
+ len += MPTCPOPT_THMAC_LEN;
+ if (port)
+ len += TCPOLEN_MPTCP_PORT_LEN;
+
+ return len;
+}
+
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_addr_info *saddr, bool *echo, bool *port);
+bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ u8 *rm_id);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
void __init mptcp_pm_nl_init(void);
@@ -448,6 +611,9 @@ void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
void mptcp_pm_nl_fully_established(struct mptcp_sock *msk);
void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk);
void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk);
+void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk);
+void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk);
+void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
@@ -455,21 +621,14 @@ static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);
}
-static inline bool before64(__u64 seq1, __u64 seq2)
-{
- return (__s64)(seq1 - seq2) < 0;
-}
-
-#define after64(seq2, seq1) before64(seq1, seq2)
-
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
-static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
{
return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
-static inline bool mptcp_check_fallback(struct sock *sk)
+static inline bool mptcp_check_fallback(const struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 6f035af1c9d2..278cbe3e539e 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -10,7 +10,7 @@
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
@@ -20,6 +20,7 @@
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
+#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"
@@ -111,9 +112,14 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li
return 0;
}
-static void subflow_init_req(struct request_sock *req,
- const struct sock *sk_listener,
- struct sk_buff *skb)
+/* Init mptcp request socket.
+ *
+ * Returns an error code if a JOIN has failed and a TCP reset
+ * should be sent.
+ */
+static int subflow_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
{
struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
@@ -124,7 +130,7 @@ static void subflow_init_req(struct request_sock *req,
ret = __subflow_init_req(req, sk_listener);
if (ret)
- return;
+ return 0;
mptcp_get_options(skb, &mp_opt);
@@ -132,7 +138,7 @@ static void subflow_init_req(struct request_sock *req,
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
if (mp_opt.mp_join)
- return;
+ return 0;
} else if (mp_opt.mp_join) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
}
@@ -156,7 +162,7 @@ again:
} else {
subflow_req->mp_capable = 1;
}
- return;
+ return 0;
}
err = mptcp_token_new_request(req);
@@ -174,7 +180,11 @@ again:
subflow_req->remote_nonce = mp_opt.nonce;
subflow_req->msk = subflow_token_join_request(req, skb);
- if (unlikely(req->syncookie) && subflow_req->msk) {
+ /* Can't fall back to TCP in this case. */
+ if (!subflow_req->msk)
+ return -EPERM;
+
+ if (unlikely(req->syncookie)) {
if (mptcp_can_accept_new_subflow(subflow_req->msk))
subflow_init_req_cookie_join_save(subflow_req, skb);
}
@@ -182,6 +192,8 @@ again:
pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
subflow_req->remote_nonce, subflow_req->msk);
}
+
+ return 0;
}
int mptcp_subflow_init_cookie_req(struct request_sock *req,
@@ -227,27 +239,53 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
-static void subflow_v4_init_req(struct request_sock *req,
- const struct sock *sk_listener,
- struct sk_buff *skb)
+static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
+ struct sk_buff *skb,
+ struct flowi *fl,
+ struct request_sock *req)
{
+ struct dst_entry *dst;
+ int err;
+
tcp_rsk(req)->is_mptcp = 1;
- tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);
+ dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
+ if (!dst)
+ return NULL;
+
+ err = subflow_init_req(req, sk, skb);
+ if (err == 0)
+ return dst;
- subflow_init_req(req, sk_listener, skb);
+ dst_release(dst);
+ if (!req->syncookie)
+ tcp_request_sock_ops.send_reset(sk, skb);
+ return NULL;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-static void subflow_v6_init_req(struct request_sock *req,
- const struct sock *sk_listener,
- struct sk_buff *skb)
+static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
+ struct sk_buff *skb,
+ struct flowi *fl,
+ struct request_sock *req)
{
+ struct dst_entry *dst;
+ int err;
+
tcp_rsk(req)->is_mptcp = 1;
- tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);
+ dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
+ if (!dst)
+ return NULL;
+
+ err = subflow_init_req(req, sk, skb);
+ if (err == 0)
+ return dst;
- subflow_init_req(req, sk_listener, skb);
+ dst_release(dst);
+ if (!req->syncookie)
+ tcp6_request_sock_ops.send_reset(sk, skb);
+ return NULL;
}
#endif
@@ -270,6 +308,24 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
return thmac == subflow->thmac;
}
+void mptcp_subflow_reset(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+
+ /* must hold: tcp_done() could drop last reference on parent */
+ sock_hold(sk);
+
+ tcp_set_state(ssk, TCP_CLOSE);
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ tcp_done(ssk);
+ if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
+ schedule_work(&mptcp_sk(sk)->work))
+ return; /* worker will put sk for us */
+
+ sock_put(sk);
+}
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -342,8 +398,7 @@ fallback:
return;
do_reset:
- tcp_send_active_reset(sk, GFP_ATOMIC);
- tcp_done(sk);
+ mptcp_subflow_reset(sk);
}
struct request_sock_ops mptcp_subflow_request_sock_ops;
@@ -434,7 +489,7 @@ static void mptcp_sock_destruct(struct sock *sk)
sock_orphan(sk);
}
- mptcp_token_destroy(mptcp_sk(sk));
+ mptcp_destroy_common(mptcp_sk(sk));
inet_sock_destruct(sk);
}
@@ -530,9 +585,8 @@ create_msk:
fallback = true;
} else if (subflow_req->mp_join) {
mptcp_get_options(skb, &mp_opt);
- if (!mp_opt.mp_join ||
- !mptcp_can_accept_new_subflow(subflow_req->msk) ||
- !subflow_hmac_valid(req, &mp_opt)) {
+ if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) ||
+ !mptcp_can_accept_new_subflow(subflow_req->msk)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
fallback = true;
}
@@ -565,6 +619,11 @@ create_child:
*/
inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);
+ /* record the newly created socket as the first msk
+ * subflow, but don't link it yet into conn_list
+ */
+ WRITE_ONCE(mptcp_sk(new_msk)->first, child);
+
/* new mpc subflow takes ownership of the newly
* created mptcp socket
*/
@@ -769,12 +828,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
if (!mpext->dsn64) {
map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
mpext->data_seq);
- subflow->use_64bit_ack = 0;
pr_debug("expanded seq=%llu", subflow->map_seq);
} else {
map_seq = mpext->data_seq;
- subflow->use_64bit_ack = 1;
}
+ WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);
if (subflow->map_valid) {
/* Allow replacing only with an identical map */
@@ -817,16 +875,23 @@ validate_seq:
return MAPPING_OK;
}
-static int subflow_read_actor(read_descriptor_t *desc,
- struct sk_buff *skb,
- unsigned int offset, size_t len)
+static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
+ u64 limit)
{
- size_t copy_len = min(desc->count, len);
-
- desc->count -= copy_len;
-
- pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
- return copy_len;
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ u32 incr;
+
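+	/* consume at most @limit bytes; if the whole skb is duplicate, also
+	 * eat a possible FIN
+	 */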
+ incr = limit >= skb->len ? skb->len + fin : limit;
+
+ pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
+ subflow->map_subflow_seq);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
+ tcp_sk(ssk)->copied_seq += incr;
+ if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
+ sk_eat_skb(ssk, skb);
+ if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
+ subflow->map_valid = 0;
}
static bool subflow_check_data_avail(struct sock *ssk)
@@ -838,13 +903,13 @@ static bool subflow_check_data_avail(struct sock *ssk)
pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
+ if (!skb_peek(&ssk->sk_receive_queue))
+ subflow->data_avail = 0;
if (subflow->data_avail)
return true;
msk = mptcp_sk(subflow->conn);
for (;;) {
- u32 map_remaining;
- size_t delta;
u64 ack_seq;
u64 old_ack;
@@ -862,6 +927,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
subflow->map_data_len = skb->len;
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
subflow->ssn_offset;
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
return true;
}
@@ -889,42 +955,18 @@ static bool subflow_check_data_avail(struct sock *ssk)
ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
ack_seq);
- if (ack_seq == old_ack)
+ if (ack_seq == old_ack) {
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
+ break;
+ } else if (after64(ack_seq, old_ack)) {
+ subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
break;
+ }
/* only accept in-sequence mapping. Old values are spurious
- * retransmission; we can hit "future" values on active backup
- * subflow switch, we relay on retransmissions to get
- * in-sequence data.
- * Cuncurrent subflows support will require subflow data
- * reordering
+ * retransmission
*/
- map_remaining = subflow->map_data_len -
- mptcp_subflow_get_map_offset(subflow);
- if (before64(ack_seq, old_ack))
- delta = min_t(size_t, old_ack - ack_seq, map_remaining);
- else
- delta = min_t(size_t, ack_seq - old_ack, map_remaining);
-
- /* discard mapped data */
- pr_debug("discarding %zu bytes, current map len=%d", delta,
- map_remaining);
- if (delta) {
- read_descriptor_t desc = {
- .count = delta,
- };
- int ret;
-
- ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
- if (ret < 0) {
- ssk->sk_err = -ret;
- goto fatal;
- }
- if (ret < delta)
- return false;
- if (delta == map_remaining)
- subflow->map_valid = 0;
- }
+ mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
}
return true;
@@ -935,13 +977,13 @@ fatal:
ssk->sk_error_report(ssk);
tcp_set_state(ssk, TCP_CLOSE);
tcp_send_active_reset(ssk, GFP_ATOMIC);
+ subflow->data_avail = 0;
return false;
}
bool mptcp_subflow_data_available(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sk_buff *skb;
/* check if current mapping is still valid */
if (subflow->map_valid &&
@@ -954,15 +996,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
subflow->map_data_len);
}
- if (!subflow_check_data_avail(sk)) {
- subflow->data_avail = 0;
- return false;
- }
-
- skb = skb_peek(&sk->sk_receive_queue);
- subflow->data_avail = skb &&
- before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
- return subflow->data_avail;
+ return subflow_check_data_avail(sk);
}
/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
@@ -979,7 +1013,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
const struct sock *sk = subflow->conn;
- *space = tcp_space(sk);
+ *space = __mptcp_space(sk);
*full_space = tcp_full_space(sk);
}
@@ -1004,18 +1038,9 @@ static void subflow_data_ready(struct sock *sk)
mptcp_data_ready(parent, sk);
}
-static void subflow_write_space(struct sock *sk)
+static void subflow_write_space(struct sock *ssk)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sock *parent = subflow->conn;
-
- sk_stream_write_space(sk);
- if (sk_stream_is_writeable(sk)) {
- set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
- smp_mb__after_atomic();
- /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
- sk_stream_write_space(parent);
- }
+ /* we take action in __mptcp_clean_una() */
}
static struct inet_connection_sock_af_ops *
@@ -1069,8 +1094,7 @@ static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
#endif
}
-int __mptcp_subflow_connect(struct sock *sk, int ifindex,
- const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1115,7 +1139,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
if (loc->family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- ssk->sk_bound_dev_if = ifindex;
+ ssk->sk_bound_dev_if = loc->ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err)
goto failed;
@@ -1127,24 +1151,51 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
subflow->local_id = local_id;
subflow->remote_id = remote_id;
subflow->request_join = 1;
- subflow->request_bkup = 1;
+ subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
mptcp_info2sockaddr(remote, &addr);
+ mptcp_add_pending_subflow(msk, subflow);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
if (err && err != -EINPROGRESS)
- goto failed;
+ goto failed_unlink;
+ return err;
+
+failed_unlink:
spin_lock_bh(&msk->join_list_lock);
- list_add_tail(&subflow->node, &msk->join_list);
+ list_del(&subflow->node);
spin_unlock_bh(&msk->join_list_lock);
- return err;
-
failed:
+ subflow->disposable = 1;
sock_release(sf);
return err;
}
+static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
+{
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
+ *child_skcd = &child->sk_cgrp_data;
+
+ /* only the additional subflows created by kworkers have to be modified */
+ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
+ cgroup_id(sock_cgroup_ptr(child_skcd))) {
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg = parent->sk_memcg;
+
+ mem_cgroup_sk_free(child);
+ if (memcg && css_tryget(&memcg->css))
+ child->sk_memcg = memcg;
+#endif /* CONFIG_MEMCG */
+
+ cgroup_sk_free(child_skcd);
+ *child_skcd = *parent_skcd;
+ cgroup_sk_clone(child_skcd);
+ }
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+}
+
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
struct mptcp_subflow_context *subflow;
@@ -1165,6 +1216,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
lock_sock(sf->sk);
+ /* the newly created socket has to be in the same cgroup as its parent */
+ mptcp_attach_cgroup(sk, sf->sk);
+
/* kernel sockets do not by default acquire net ref, but TCP timer
* needs it.
*/
@@ -1263,7 +1317,6 @@ static void subflow_state_change(struct sock *sk)
mptcp_data_ready(parent, sk);
if (__mptcp_check_fallback(mptcp_sk(parent)) &&
- !(parent->sk_shutdown & RCV_SHUTDOWN) &&
!subflow->rx_eof && subflow_is_done(sk)) {
subflow->rx_eof = 1;
mptcp_subflow_eof(parent);
@@ -1306,17 +1359,27 @@ out:
return err;
}
-static void subflow_ulp_release(struct sock *sk)
+static void subflow_ulp_release(struct sock *ssk)
{
- struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+ struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
+ bool release = true;
+ struct sock *sk;
if (!ctx)
return;
- if (ctx->conn)
- sock_put(ctx->conn);
+ sk = ctx->conn;
+ if (sk) {
+		/* if the msk has been orphaned, keep the ctx
+		 * alive; it will be freed by __mptcp_close_ssk()
+		 * when the subflow is still unaccepted
+		 */
+ release = ctx->disposable || list_empty(&ctx->node);
+ sock_put(sk);
+ }
- kfree_rcu(ctx, rcu);
+ if (release)
+ kfree_rcu(ctx, rcu);
}
static void subflow_ulp_clone(const struct request_sock *req,
@@ -1401,7 +1464,7 @@ void __init mptcp_subflow_init(void)
panic("MPTCP: failed to init subflow request sock ops\n");
subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
- subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;
+ subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
subflow_specific = ipv4_specific;
subflow_specific.conn_request = subflow_v4_conn_request;
@@ -1410,7 +1473,7 @@ void __init mptcp_subflow_init(void)
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
- subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;
+ subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
subflow_v6_specific = ipv6_specific;
subflow_v6_specific.conn_request = subflow_v6_conn_request;
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 8b47c4bb1c6b..feb4b9ffd462 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -291,7 +291,7 @@ struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
{
struct mptcp_sock *ret = NULL;
struct hlist_nulls_node *pos;
- int slot, num;
+ int slot, num = 0;
for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) {
struct token_bucket *bucket = &token_hash[slot];
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index f1be3e3f6425..a9cb355324d1 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -1726,9 +1726,6 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
ndp->ptype.dev = dev;
dev_add_pack(&ndp->ptype);
- /* Set up generic netlink interface */
- ncsi_init_netlink(dev);
-
pdev = to_platform_device(dev->dev.parent);
if (pdev) {
np = pdev->dev.of_node;
@@ -1892,8 +1889,6 @@ void ncsi_unregister_dev(struct ncsi_dev *nd)
list_del_rcu(&ndp->node);
spin_unlock_irqrestore(&ncsi_dev_lock, flags);
- ncsi_unregister_netlink(nd->dev);
-
kfree(ndp);
}
EXPORT_SYMBOL_GPL(ncsi_unregister_dev);
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 8b386d766e7d..bb5f1650f11c 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -716,7 +716,7 @@ static int ncsi_set_channel_mask_nl(struct sk_buff *msg,
return 0;
}
-static const struct genl_ops ncsi_ops[] = {
+static const struct genl_small_ops ncsi_ops[] = {
{
.cmd = NCSI_CMD_PKG_INFO,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -762,28 +762,12 @@ static struct genl_family ncsi_genl_family __ro_after_init = {
.maxattr = NCSI_ATTR_MAX,
.policy = ncsi_genl_policy,
.module = THIS_MODULE,
- .ops = ncsi_ops,
- .n_ops = ARRAY_SIZE(ncsi_ops),
+ .small_ops = ncsi_ops,
+ .n_small_ops = ARRAY_SIZE(ncsi_ops),
};
-int ncsi_init_netlink(struct net_device *dev)
+static int __init ncsi_init_netlink(void)
{
- int rc;
-
- rc = genl_register_family(&ncsi_genl_family);
- if (rc)
- netdev_err(dev, "ncsi: failed to register netlink family\n");
-
- return rc;
-}
-
-int ncsi_unregister_netlink(struct net_device *dev)
-{
- int rc;
-
- rc = genl_unregister_family(&ncsi_genl_family);
- if (rc)
- netdev_err(dev, "ncsi: failed to unregister netlink family\n");
-
- return rc;
+ return genl_register_family(&ncsi_genl_family);
}
+subsys_initcall(ncsi_init_netlink);
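The hunks above convert the NCSI family to genl_small_ops and register it once at boot instead of once per registered device. struct genl_small_ops carries only the .doit/.dumpit handlers plus cmd/flags/validate, leaving dump start/done callbacks and per-op policies to the full struct genl_ops; the family-wide .policy still applies. A minimal sketch of a family declared this way, with purely illustrative names (the example_* identifiers are not part of this patch):

#include <linux/init.h>
#include <linux/module.h>
#include <net/genetlink.h>

static int example_doit(struct sk_buff *skb, struct genl_info *info)
{
	return 0;	/* placeholder handler */
}

static const struct genl_small_ops example_small_ops[] = {
	{
		.cmd	= 1,
		.doit	= example_doit,
	},
};

static struct genl_family example_family __ro_after_init = {
	.name		= "example",
	.version	= 1,
	.maxattr	= 1,
	.module		= THIS_MODULE,
	.small_ops	= example_small_ops,
	.n_small_ops	= ARRAY_SIZE(example_small_ops),
};

static int __init example_init_netlink(void)
{
	return genl_register_family(&example_family);
}
subsys_initcall(example_init_netlink);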
diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h
index 7502723fba83..39a1a9d7bf77 100644
--- a/net/ncsi/ncsi-netlink.h
+++ b/net/ncsi/ncsi-netlink.h
@@ -22,7 +22,4 @@ int ncsi_send_netlink_err(struct net_device *dev,
struct nlmsghdr *nlhdr,
int err);
-int ncsi_init_netlink(struct net_device *dev);
-int ncsi_unregister_netlink(struct net_device *dev);
-
#endif /* __NCSI_NETLINK_H__ */
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 5b1f4ec66dd9..888ccc2d4e34 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -1120,7 +1120,7 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
int payload, i, ret;
/* Find the NCSI device */
- nd = ncsi_find_dev(dev);
+ nd = ncsi_find_dev(orig_dev);
ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL;
if (!ndp)
return -ENODEV;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 25313c29d799..49fbef0d99be 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -441,6 +441,7 @@ endif # NF_CONNTRACK
config NF_TABLES
select NETFILTER_NETLINK
+ select LIBCRC32C
tristate "Netfilter nf_tables support"
help
nftables is the new packet classification framework that intends to
@@ -681,6 +682,16 @@ config NFT_FIB_NETDEV
The lookup will be delegated to the IPv4 or IPv6 FIB depending
on the protocol of the packet.
+config NFT_REJECT_NETDEV
+ depends on NFT_REJECT_IPV4
+ depends on NFT_REJECT_IPV6
+ tristate "Netfilter nf_tables netdev REJECT support"
+ help
+ This option enables the REJECT support from the netdev table.
+ The return packet generation will be delegated to the IPv4
+ or IPv6 ICMP or TCP RST implementation depending on the
+ protocol of the packet.
+
endif # NF_TABLES_NETDEV
endif # NF_TABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 0e0ded87e27b..33da7bf1b68e 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -101,6 +101,7 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
+obj-$(CONFIG_NFT_REJECT_NETDEV) += nft_reject_netdev.o
obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o
obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 3ac7c8c1548d..63d032191e62 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -282,6 +282,16 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
return NULL;
return net->nf.hooks_bridge + hooknum;
#endif
+#ifdef CONFIG_NETFILTER_INGRESS
+ case NFPROTO_INET:
+ if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS))
+ return NULL;
+ if (!dev || dev_net(dev) != net) {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+ return &dev->nf_hooks_ingress;
+#endif
case NFPROTO_IPV4:
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum))
return NULL;
@@ -311,20 +321,80 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
return NULL;
}
+static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg,
+ int hooknum)
+{
+#ifndef CONFIG_NETFILTER_INGRESS
+ if (reg->hooknum == hooknum)
+ return -EOPNOTSUPP;
+#endif
+ if (reg->hooknum != hooknum ||
+ !reg->dev || dev_net(reg->dev) != net)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf)
+{
+ if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) ||
+ (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS))
+ return true;
+
+ return false;
+}
+
+static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int hooknum;
+
+ if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) {
+ pf = NFPROTO_NETDEV;
+ hooknum = NF_NETDEV_INGRESS;
+ } else {
+ hooknum = reg->hooknum;
+ }
+ static_key_slow_inc(&nf_hooks_needed[pf][hooknum]);
+#endif
+}
+
+static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int hooknum;
+
+ if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) {
+ pf = NFPROTO_NETDEV;
+ hooknum = NF_NETDEV_INGRESS;
+ } else {
+ hooknum = reg->hooknum;
+ }
+ static_key_slow_dec(&nf_hooks_needed[pf][hooknum]);
+#endif
+}
+
static int __nf_register_net_hook(struct net *net, int pf,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *p, *new_hooks;
struct nf_hook_entries __rcu **pp;
+ int err;
- if (pf == NFPROTO_NETDEV) {
-#ifndef CONFIG_NETFILTER_INGRESS
- if (reg->hooknum == NF_NETDEV_INGRESS)
- return -EOPNOTSUPP;
-#endif
- if (reg->hooknum != NF_NETDEV_INGRESS ||
- !reg->dev || dev_net(reg->dev) != net)
- return -EINVAL;
+ switch (pf) {
+ case NFPROTO_NETDEV:
+ err = nf_ingress_check(net, reg, NF_NETDEV_INGRESS);
+ if (err < 0)
+ return err;
+ break;
+ case NFPROTO_INET:
+ if (reg->hooknum != NF_INET_INGRESS)
+ break;
+
+ err = nf_ingress_check(net, reg, NF_INET_INGRESS);
+ if (err < 0)
+ return err;
+ break;
}
pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
@@ -345,12 +415,11 @@ static int __nf_register_net_hook(struct net *net, int pf,
hooks_validate(new_hooks);
#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ if (nf_ingress_hook(reg, pf))
net_inc_ingress_queue();
#endif
-#ifdef CONFIG_JUMP_LABEL
- static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
-#endif
+ nf_static_key_inc(reg, pf);
+
BUG_ON(p == new_hooks);
nf_hook_entries_free(p);
return 0;
@@ -403,12 +472,10 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
if (nf_remove_net_hook(p, reg)) {
#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ if (nf_ingress_hook(reg, pf))
net_dec_ingress_queue();
#endif
-#ifdef CONFIG_JUMP_LABEL
- static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
-#endif
+ nf_static_key_dec(reg, pf);
} else {
WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
}
@@ -425,8 +492,12 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
if (reg->pf == NFPROTO_INET) {
- __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
- __nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
+ if (reg->hooknum == NF_INET_INGRESS) {
+ __nf_unregister_net_hook(net, NFPROTO_INET, reg);
+ } else {
+ __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+ __nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
+ }
} else {
__nf_unregister_net_hook(net, reg->pf, reg);
}
@@ -451,14 +522,20 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
int err;
if (reg->pf == NFPROTO_INET) {
- err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
- if (err < 0)
- return err;
-
- err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
- if (err < 0) {
- __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
- return err;
+ if (reg->hooknum == NF_INET_INGRESS) {
+ err = __nf_register_net_hook(net, NFPROTO_INET, reg);
+ if (err < 0)
+ return err;
+ } else {
+ err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
+ if (err < 0)
+ return err;
+
+ err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
+ if (err < 0) {
+ __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+ return err;
+ }
}
} else {
err = __nf_register_net_hook(net, reg->pf, reg);
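With the changes above, nf_register_net_hook() accepts NFPROTO_INET hooks at the new NF_INET_INGRESS hook point and attaches them to the per-device ingress list (sharing the NFPROTO_NETDEV/NF_NETDEV_INGRESS static key) instead of splitting them into separate IPv4 and IPv6 registrations. A minimal sketch of how a module might register such a hook; the hook body, priority and names are illustrative, not part of this patch:

#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>

static unsigned int example_inet_ingress(void *priv, struct sk_buff *skb,
					 const struct nf_hook_state *state)
{
	return NF_ACCEPT;	/* a real hook would classify/filter here */
}

static struct nf_hook_ops example_ops = {
	.hook		= example_inet_ingress,
	.pf		= NFPROTO_INET,
	.hooknum	= NF_INET_INGRESS,
	.priority	= 0,
};

/* .dev must be set and must belong to @net, as nf_ingress_check() above
 * enforces for ingress hooks.
 */
static int example_attach(struct net *net, struct net_device *dev)
{
	example_ops.dev = dev;
	return nf_register_net_hook(net, &example_ops);
}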
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 920b7c4331f0..89009c82a6b2 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -250,22 +250,7 @@ EXPORT_SYMBOL_GPL(ip_set_type_unregister);
void *
ip_set_alloc(size_t size)
{
- void *members = NULL;
-
- if (size < KMALLOC_MAX_SIZE)
- members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
-
- if (members) {
- pr_debug("%p: allocated with kmalloc\n", members);
- return members;
- }
-
- members = vzalloc(size);
- if (!members)
- return NULL;
- pr_debug("%p: allocated with vmalloc\n", members);
-
- return members;
+ return kvzalloc(size, GFP_KERNEL_ACCOUNT);
}
EXPORT_SYMBOL_GPL(ip_set_alloc);
@@ -286,8 +271,7 @@ flag_nested(const struct nlattr *nla)
static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
[IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 },
- [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY,
- .len = sizeof(struct in6_addr) },
+ [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
};
int
@@ -652,13 +636,14 @@ ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext,
if (SET_WITH_COUNTER(set)) {
struct ip_set_counter *counter = ext_counter(data, set);
+ ip_set_update_counter(counter, ext, flags);
+
if (flags & IPSET_FLAG_MATCH_COUNTERS &&
!(ip_set_match_counter(ip_set_get_packets(counter),
mext->packets, mext->packets_op) &&
ip_set_match_counter(ip_set_get_bytes(counter),
mext->bytes, mext->bytes_op)))
return false;
- ip_set_update_counter(counter, ext, flags);
}
if (SET_WITH_SKBINFO(set))
ip_set_get_skbinfo(ext_skbinfo(data, set),
@@ -1124,6 +1109,8 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
ret = -IPSET_ERR_PROTOCOL;
goto put_out;
}
+ /* Set create flags depending on the type revision */
+ set->flags |= set->type->create_flags[revision];
ret = set->type->create(net, set, tb, flags);
if (ret != 0)
@@ -1254,10 +1241,12 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
/* Modified by ip_set_destroy() only, which is serialized */
inst->is_destroyed = false;
} else {
+ u32 flags = flag_exist(nlh);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
if (!s) {
- ret = -ENOENT;
+ if (!(flags & IPSET_FLAG_EXIST))
+ ret = -ENOENT;
goto out;
} else if (s->ref || s->ref_netlink) {
ret = -IPSET_ERR_BUSY;
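ip_set_alloc() above drops the open-coded kmalloc-then-vmalloc fallback in favour of kvzalloc(), which performs the same fallback internally; GFP_KERNEL_ACCOUNT additionally charges the allocation to the caller's memory cgroup. A hedged usage sketch (the example_* names are illustrative):

#include <linux/mm.h>
#include <linux/slab.h>

static void *example_alloc(size_t size)
{
	/* kvzalloc() tries kmalloc() first and falls back to vmalloc()
	 * for sizes the slab allocator cannot satisfy.
	 */
	return kvzalloc(size, GFP_KERNEL_ACCOUNT);
}

static void example_free(void *p)
{
	kvfree(p);	/* releases either kind of allocation */
}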
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 521e970be402..6186358eac7c 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -37,18 +37,18 @@
*/
/* Number of elements to store in an initial array block */
-#define AHASH_INIT_SIZE 4
+#define AHASH_INIT_SIZE 2
/* Max number of elements to store in an array block */
-#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE)
+#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE)
/* Max number of elements in the array block when tuned */
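(Note: no insert here; see the replace on this line.)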
#define AHASH_MAX_TUNED 64
+#define AHASH_MAX(h) ((h)->bucketsize)
+
/* Max number of elements can be tuned */
#ifdef IP_SET_HASH_WITH_MULTI
-#define AHASH_MAX(h) ((h)->ahash_max)
-
static u8
-tune_ahash_max(u8 curr, u32 multi)
+tune_bucketsize(u8 curr, u32 multi)
{
u32 n;
@@ -61,12 +61,10 @@ tune_ahash_max(u8 curr, u32 multi)
*/
return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
}
-
-#define TUNE_AHASH_MAX(h, multi) \
- ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
+#define TUNE_BUCKETSIZE(h, multi) \
+ ((h)->bucketsize = tune_bucketsize((h)->bucketsize, multi))
#else
-#define AHASH_MAX(h) AHASH_MAX_SIZE
-#define TUNE_AHASH_MAX(h, multi)
+#define TUNE_BUCKETSIZE(h, multi)
#endif
/* A hash bucket */
@@ -143,20 +141,6 @@ htable_size(u8 hbits)
return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}
-/* Compute htable_bits from the user input parameter hashsize */
-static u8
-htable_bits(u32 hashsize)
-{
- /* Assume that hashsize == 2^htable_bits */
- u8 bits = fls(hashsize - 1);
-
- if (jhash_size(bits) != hashsize)
- /* Round up to the first 2^n value */
- bits = fls(hashsize);
-
- return bits;
-}
-
#ifdef IP_SET_HASH_WITH_NETS
#if IPSET_NET_COUNT > 1
#define __CIDR(cidr, i) (cidr[i])
@@ -321,9 +305,7 @@ struct htype {
#ifdef IP_SET_HASH_WITH_MARKMASK
u32 markmask; /* markmask value for mark mask to store */
#endif
-#ifdef IP_SET_HASH_WITH_MULTI
- u8 ahash_max; /* max elements in an array block */
-#endif
+ u8 bucketsize; /* max elements in an array block */
#ifdef IP_SET_HASH_WITH_NETMASK
u8 netmask; /* netmask value for subnets to store */
#endif
@@ -644,7 +626,7 @@ mtype_resize(struct ip_set *set, bool retried)
struct htype *h = set->data;
struct htable *t, *orig;
u8 htable_bits;
- size_t dsize = set->dsize;
+ size_t hsize, dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
struct mtype_elem *tmp;
@@ -668,14 +650,12 @@ mtype_resize(struct ip_set *set, bool retried)
retry:
ret = 0;
htable_bits++;
- if (!htable_bits) {
- /* In case we have plenty of memory :-) */
- pr_warn("Cannot increase the hashsize of set %s further\n",
- set->name);
- ret = -IPSET_ERR_HASH_FULL;
- goto out;
- }
- t = ip_set_alloc(htable_size(htable_bits));
+ if (!htable_bits)
+ goto hbwarn;
+ hsize = htable_size(htable_bits);
+ if (!hsize)
+ goto hbwarn;
+ t = ip_set_alloc(hsize);
if (!t) {
ret = -ENOMEM;
goto out;
@@ -817,6 +797,12 @@ cleanup:
if (ret == -EAGAIN)
goto retry;
goto out;
+
+hbwarn:
+ /* In case we have plenty of memory :-) */
+ pr_warn("Cannot increase the hashsize of set %s further\n", set->name);
+ ret = -IPSET_ERR_HASH_FULL;
+ goto out;
}
/* Get the current number of elements and ext_size in the set */
@@ -950,7 +936,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
- TUNE_AHASH_MAX(h, multi);
+ TUNE_BUCKETSIZE(h, multi);
if (n->size >= AHASH_MAX(h)) {
/* Trigger rehashing */
mtype_data_next(&h->next, d);
@@ -1305,6 +1291,11 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
goto nla_put_failure;
#endif
+ if (set->flags & IPSET_CREATE_FLAG_BUCKETSIZE) {
+ if (nla_put_u8(skb, IPSET_ATTR_BUCKETSIZE, h->bucketsize) ||
+ nla_put_net32(skb, IPSET_ATTR_INITVAL, htonl(h->initval)))
+ goto nla_put_failure;
+ }
if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
@@ -1520,7 +1511,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
if (!h)
return -ENOMEM;
- hbits = htable_bits(hashsize);
+ /* Compute htable_bits from the user input parameter hashsize.
+ * Assume that hashsize == 2^htable_bits,
+ * otherwise round up to the first 2^n value.
+ */
+ hbits = fls(hashsize - 1);
hsize = htable_size(hbits);
if (hsize == 0) {
kfree(h);
@@ -1547,8 +1542,20 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#ifdef IP_SET_HASH_WITH_MARKMASK
h->markmask = markmask;
#endif
- get_random_bytes(&h->initval, sizeof(h->initval));
-
+ if (tb[IPSET_ATTR_INITVAL])
+ h->initval = ntohl(nla_get_be32(tb[IPSET_ATTR_INITVAL]));
+ else
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->bucketsize = AHASH_MAX_SIZE;
+ if (tb[IPSET_ATTR_BUCKETSIZE]) {
+ h->bucketsize = nla_get_u8(tb[IPSET_ATTR_BUCKETSIZE]);
+ if (h->bucketsize < AHASH_INIT_SIZE)
+ h->bucketsize = AHASH_INIT_SIZE;
+ else if (h->bucketsize > AHASH_MAX_SIZE)
+ h->bucketsize = AHASH_MAX_SIZE;
+ else if (h->bucketsize % 2)
+ h->bucketsize += 1;
+ }
t->htable_bits = hbits;
t->maxelem = h->maxelem / ahash_numof_locks(hbits);
RCU_INIT_POINTER(h->table, t);
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 5d6d68eaf6a9..d1bef23fd4f5 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -23,7 +23,8 @@
/* 1 Counters support */
/* 2 Comments support */
/* 3 Forceadd support */
-#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */
+/* 4 skbinfo support */
+#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -277,11 +278,13 @@ static struct ip_set_type hash_ip_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ip_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c
index eceb7bc4a93a..467c59a83c0a 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmac.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmac.c
@@ -23,7 +23,7 @@
#include <linux/netfilter/ipset/ip_set_hash.h>
#define IPSET_TYPE_REV_MIN 0
-#define IPSET_TYPE_REV_MAX 0
+#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>");
@@ -268,11 +268,13 @@ static struct ip_set_type hash_ipmac_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipmac_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index aba1df617d6e..18346d18aa16 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -21,7 +21,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 1 Forceadd support */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */
+/* 2 skbinfo support */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>");
@@ -274,12 +275,14 @@ static struct ip_set_type hash_ipmark_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipmark_create,
.create_policy = {
[IPSET_ATTR_MARKMASK] = { .type = NLA_U32 },
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 1ff228717e29..e1ca11196515 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -25,7 +25,8 @@
/* 2 Counters support added */
/* 3 Comments support added */
/* 4 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+/* 5 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -341,11 +342,13 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipport_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index fa88afd812fa..ab179e064597 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -25,7 +25,8 @@
/* 2 Counters support added */
/* 3 Comments support added */
/* 4 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+/* 5 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -356,11 +357,13 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipportip_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index eef6ecfcb409..8f075b44cf64 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -27,7 +27,8 @@
/* 4 Counters support added */
/* 5 Comments support added */
/* 6 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+/* 7 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -513,11 +514,13 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipportnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 0b61593165ef..718814730acf 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -16,7 +16,7 @@
#include <linux/netfilter/ipset/ip_set_hash.h>
#define IPSET_TYPE_REV_MIN 0
-#define IPSET_TYPE_REV_MAX 0
+#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -125,11 +125,13 @@ static struct ip_set_type hash_mac_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_mac_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 136cf0781d3a..c1a11f041ac6 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -24,7 +24,8 @@
/* 3 Counters support added */
/* 4 Comments support added */
/* 5 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */
+/* 6 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 7 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -354,11 +355,13 @@ static struct ip_set_type hash_net_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_net_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index be5e95a0d876..ddd51c2e1cb3 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -26,7 +26,8 @@
/* 4 Comments support added */
/* 5 Forceadd support added */
/* 6 skbinfo support added */
-#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */
+/* 7 interface wildcard support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -225,7 +226,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
- nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
+ nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -442,7 +443,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
ip6_netmask(&e.ip, e.cidr);
- nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
+ nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -470,11 +471,13 @@ static struct ip_set_type hash_netiface_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netiface_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
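The nla_strlcpy() calls above become nla_strscpy(), which follows strscpy() semantics: it returns the number of bytes copied, or -E2BIG when the attribute value did not fit into the destination buffer. The ipset callers keep ignoring the return value, as before; a short, purely illustrative fragment of a caller that does check it:

	char ifname[IFNAMSIZ];
	ssize_t len;

	len = nla_strscpy(ifname, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
	if (len < 0)	/* -E2BIG: the interface name was truncated */
		return -IPSET_ERR_PROTOCOL;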
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index da4ef910b12d..6532f0505e66 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -22,7 +22,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 1 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+/* 2 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -459,11 +460,13 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 34448df80fb9..ec1564a1cb5a 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -26,7 +26,8 @@
/* 4 Counters support added */
/* 5 Comments support added */
/* 6 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+/* 7 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -460,11 +461,13 @@ static struct ip_set_type hash_netport_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netport_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 934c1712cba8..0e91d1e82f1c 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -23,7 +23,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 0 Comments support added */
/* 1 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+/* 2 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -558,11 +559,13 @@ static struct ip_set_type hash_netportnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netportnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 2c1593089ede..eb0e329f9b8d 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -29,7 +29,6 @@ if IP_VS
config IP_VS_IPV6
bool "IPv6 support for IPVS"
depends on IPV6 = y || IP_VS = IPV6
- select IP6_NF_IPTABLES
select NF_DEFRAG_IPV6
help
Add IPv6 support to IPVS.
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index a90b8eac16ac..c100c6b112c8 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -402,6 +402,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
unsigned int hash;
struct ip_vs_conn *cp, *ret=NULL;
+ const union nf_inet_addr *saddr;
+ __be16 sport;
/*
* Check for "full" addressed entries
@@ -411,10 +413,20 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
rcu_read_lock();
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
- if (p->vport == cp->cport && p->cport == cp->dport &&
- cp->af == p->af &&
+ if (p->vport != cp->cport)
+ continue;
+
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ sport = cp->vport;
+ saddr = &cp->vaddr;
+ } else {
+ sport = cp->dport;
+ saddr = &cp->daddr;
+ }
+
+ if (p->cport == sport && cp->af == p->af &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
- ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+ ip_vs_addr_equal(p->af, p->caddr, saddr) &&
p->protocol == cp->protocol &&
cp->ipvs == p->ipvs) {
if (!__ip_vs_conn_get(cp))
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index e3668a6e54e4..54e086c65721 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -742,12 +742,12 @@ static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
struct dst_entry *dst = skb_dst(skb);
if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
- ip6_route_me_harder(ipvs->net, skb) != 0)
+ ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0)
return 1;
} else
#endif
if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
+ ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0)
return 1;
return 0;
@@ -875,7 +875,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
unsigned int verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- goto ignore_cp;
+ goto after_nat;
/* Ensure the checksum is correct */
if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
@@ -901,6 +901,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto out;
+after_nat:
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
@@ -909,8 +910,6 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
-
-ignore_cp:
verdict = NF_ACCEPT;
out:
@@ -1276,6 +1275,9 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
{
struct ip_vs_protocol *pp = pd->pp;
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ goto after_nat;
+
IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
if (skb_ensure_writable(skb, iph->len))
@@ -1316,6 +1318,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
+after_nat:
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
@@ -1412,11 +1415,8 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
ipvs, af, skb, &iph);
- if (likely(cp)) {
- if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- goto ignore_cp;
+ if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
- }
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
@@ -1475,14 +1475,9 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
}
}
-out:
IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
"ip_vs_out: packet continues traversal as normal");
return NF_ACCEPT;
-
-ignore_cp:
- __ip_vs_conn_put(cp);
- goto out;
}
/*
@@ -2142,7 +2137,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
pkts = sysctl_sync_threshold(ipvs);
else
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 678c5b14841c..d45dbcba8b49 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2508,6 +2508,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
/* Set timeout values for (tcp tcpfin udp) */
ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
+ } else if (!len) {
+ /* No more commands with len == 0 below */
+ ret = -EINVAL;
+ goto out_unlock;
}
usvc_compat = (struct ip_vs_service_user *)arg;
@@ -2584,9 +2588,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
break;
case IP_VS_SO_SET_DELDEST:
ret = ip_vs_del_dest(svc, &udest);
- break;
- default:
- ret = -EINVAL;
}
out_unlock:
@@ -3892,7 +3893,7 @@ out:
}
-static const struct genl_ops ip_vs_genl_ops[] = {
+static const struct genl_small_ops ip_vs_genl_ops[] = {
{
.cmd = IPVS_CMD_NEW_SERVICE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -4000,8 +4001,8 @@ static struct genl_family ip_vs_genl_family __ro_after_init = {
.policy = ip_vs_cmd_policy,
.netnsok = true, /* Make ipvsadm to work on netns */
.module = THIS_MODULE,
- .ops = ip_vs_genl_ops,
- .n_ops = ARRAY_SIZE(ip_vs_genl_ops),
+ .small_ops = ip_vs_genl_ops,
+ .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops),
};
static int __init ip_vs_genl_register(void)
@@ -4166,12 +4167,18 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
spin_lock_init(&ipvs->tot_stats.lock);
- proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops,
- sizeof(struct ip_vs_iter));
- proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
- ip_vs_stats_show, NULL);
- proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
- ip_vs_stats_percpu_show, NULL);
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
+ &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
+ goto err_vs;
+ if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
+ ip_vs_stats_show, NULL))
+ goto err_stats;
+ if (!proc_create_net_single("ip_vs_stats_percpu", 0,
+ ipvs->net->proc_net,
+ ip_vs_stats_percpu_show, NULL))
+ goto err_percpu;
+#endif
if (ip_vs_control_net_init_sysctl(ipvs))
goto err;
@@ -4179,6 +4186,17 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
return 0;
err:
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
+
+err_percpu:
+ remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
+
+err_stats:
+ remove_proc_entry("ip_vs", ipvs->net->proc_net);
+
+err_vs:
+#endif
free_percpu(ipvs->tot_stats.cpustats);
return -ENOMEM;
}
@@ -4187,9 +4205,11 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
ip_vs_trash_cleanup(ipvs);
ip_vs_control_net_cleanup_sysctl(ipvs);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
remove_proc_entry("ip_vs", ipvs->net->proc_net);
+#endif
free_percpu(ipvs->tot_stats.cpustats);
}
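ip_vs_control_net_init() above gains failure handling for the three proc entries, using the usual reverse-order unwind labels so that each error path removes exactly what was created before it. A condensed sketch of the pattern with illustrative names (example_*, show_a and show_b are hypothetical, not part of this patch):

#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>

static int show_a(struct seq_file *seq, void *v) { return 0; }
static int show_b(struct seq_file *seq, void *v) { return 0; }

static int example_net_init(struct net *net)
{
	if (!proc_create_net_single("example_a", 0, net->proc_net,
				    show_a, NULL))
		goto err_a;
	if (!proc_create_net_single("example_b", 0, net->proc_net,
				    show_b, NULL))
		goto err_b;
	return 0;

err_b:
	remove_proc_entry("example_a", net->proc_net);
err_a:
	return -ENOMEM;
}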
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index dc2e7da2742a..7da51390cea6 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -539,8 +539,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
if (new_state != cp->state) {
struct ip_vs_dest *dest = cp->dest;
- IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
- "%s:%d state: %s->%s conn->refcnt:%d\n",
+ IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d "
+ "d:%s:%d state: %s->%s conn->refcnt:%d\n",
pd->pp->name,
((state_off == TCP_DIR_OUTPUT) ?
"output " : "input "),
@@ -548,10 +548,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
th->fin ? 'F' : '.',
th->ack ? 'A' : '.',
th->rst ? 'R' : '.',
- IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
- ntohs(cp->dport),
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
+ ntohs(cp->dport),
tcp_state_name(cp->state),
tcp_state_name(new_state),
refcount_read(&cp->refcnt));
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 2b8abbfe018c..9d43277b8b4f 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -242,9 +242,6 @@ struct ip_vs_sync_thread_data {
| IPVS Sync Connection (1) |
*/
-#define SYNC_MESG_HEADER_LEN 4
-#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-
/* Version 0 header */
struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
@@ -618,7 +615,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
cp = cp->control;
if (cp) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
ip_vs_sync_conn(ipvs, cp, pkts);
@@ -779,7 +776,7 @@ control:
if (!cp)
return;
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
goto sloop;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index b00866d777fe..d2e5a8f644b8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -609,6 +609,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
if (ret == NF_ACCEPT) {
nf_reset_ct(skb);
skb_forward_csum(skb);
+ if (skb->dev)
+ skb->tstamp = 0;
}
return ret;
}
@@ -649,6 +651,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
skb_forward_csum(skb);
+ if (skb->dev)
+ skb->tstamp = 0;
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
@@ -669,6 +673,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
+ if (skb->dev)
+ skb->tstamp = 0;
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5b97d233f89b..ff0168736f6e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -859,7 +859,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
out:
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
return -EEXIST;
}
@@ -909,6 +908,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
tstamp->start = ktime_get_real_ns();
}
+/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
struct nf_conntrack_tuple_hash *h)
{
@@ -922,23 +922,21 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
if (nf_ct_is_dying(ct))
return NF_DROP;
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- return NF_DROP;
-
if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
nf_ct_match(ct, loser_ct)) {
struct net *net = nf_ct_net(ct);
+ nf_conntrack_get(&ct->ct_general);
+
nf_ct_acct_merge(ct, ctinfo, loser_ct);
nf_ct_add_to_dying_list(loser_ct);
nf_conntrack_put(&loser_ct->ct_general);
nf_ct_set(skb, ct, ctinfo);
- NF_CT_STAT_INC(net, insert_failed);
+ NF_CT_STAT_INC(net, clash_resolve);
return NF_ACCEPT;
}
- nf_ct_put(ct);
return NF_DROP;
}
@@ -998,6 +996,8 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
&nf_conntrack_hash[repl_idx]);
+
+ NF_CT_STAT_INC(net, clash_resolve);
return NF_ACCEPT;
}
@@ -1027,10 +1027,10 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
*
* Failing that, the new, unconfirmed conntrack is still added to the table
* provided that the collision only occurs in the ORIGINAL direction.
- * The new entry will be added after the existing one in the hash list,
+ * The new entry will be added only in the non-clashing REPLY direction,
* so packets in the ORIGINAL direction will continue to match the existing
* entry. The new entry will also have a fixed timeout so it expires --
- * due to the collision, it will not see bidirectional traffic.
+ * due to the collision, it will only see reply traffic.
*
* Returns NF_DROP if the clash could not be resolved.
*/
@@ -1229,7 +1229,8 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
* Let nf_ct_resolve_clash() deal with this later.
*/
if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
+ nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
continue;
NF_CT_STAT_INC_ATOMIC(net, found);
@@ -1725,10 +1726,8 @@ nf_conntrack_handle_icmp(struct nf_conn *tmpl,
else
return NF_ACCEPT;
- if (ret <= 0) {
+ if (ret <= 0)
NF_CT_STAT_INC_ATOMIC(state->net, error);
- NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- }
return ret;
}
@@ -1802,10 +1801,8 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
if (tmpl || ctinfo == IP_CT_UNTRACKED) {
/* Previously seen (loopback or untracked)? Ignore. */
if ((tmpl && !nf_ct_is_template(tmpl)) ||
- ctinfo == IP_CT_UNTRACKED) {
- NF_CT_STAT_INC_ATOMIC(state->net, ignore);
+ ctinfo == IP_CT_UNTRACKED)
return NF_ACCEPT;
- }
skb->_nfct = 0;
}
@@ -1813,7 +1810,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
if (dataoff <= 0) {
pr_debug("not prepared to track yet or error occurred\n");
- NF_CT_STAT_INC_ATOMIC(state->net, error);
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
ret = NF_ACCEPT;
goto out;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c3a4214dc958..84caf3316946 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -167,10 +167,14 @@ nla_put_failure:
return -1;
}
-static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct,
+ bool skip_zero)
{
long timeout = nf_ct_expires(ct) / HZ;
+ if (skip_zero && timeout == 0)
+ return 0;
+
if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
goto nla_put_failure;
return 0;
@@ -179,7 +183,8 @@ nla_put_failure:
return -1;
}
-static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
+static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct,
+ bool destroy)
{
const struct nf_conntrack_l4proto *l4proto;
struct nlattr *nest_proto;
@@ -193,7 +198,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
if (!nest_proto)
goto nla_put_failure;
- ret = l4proto->to_nlattr(skb, nest_proto, ct);
+ ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy);
nla_nest_end(skb, nest_proto);
@@ -537,8 +542,8 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
return -1;
if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) &&
- (ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_protoinfo(skb, ct) < 0))
+ (ctnetlink_dump_timeout(skb, ct, false) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct, false) < 0))
return -1;
return 0;
@@ -780,15 +785,19 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
if (events & (1 << IPCT_DESTROY)) {
+ if (ctnetlink_dump_timeout(skb, ct, true) < 0)
+ goto nla_put_failure;
+
if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
- ctnetlink_dump_timestamp(skb, ct) < 0)
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct, true) < 0)
goto nla_put_failure;
} else {
- if (ctnetlink_dump_timeout(skb, ct) < 0)
+ if (ctnetlink_dump_timeout(skb, ct, false) < 0)
goto nla_put_failure;
- if (events & (1 << IPCT_PROTOINFO)
- && ctnetlink_dump_protoinfo(skb, ct) < 0)
+ if (events & (1 << IPCT_PROTOINFO) &&
+ ctnetlink_dump_protoinfo(skb, ct, false) < 0)
goto nla_put_failure;
if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
@@ -2497,7 +2506,6 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
- nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
htonl(st->insert_failed)) ||
@@ -2505,7 +2513,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
- htonl(st->search_restart)))
+ htonl(st->search_restart)) ||
+ nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE,
+ htonl(st->clash_resolve)))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -2719,10 +2729,10 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
if (ctnetlink_dump_status(skb, ct) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_timeout(skb, ct) < 0)
+ if (ctnetlink_dump_timeout(skb, ct, false) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_protoinfo(skb, ct) < 0)
+ if (ctnetlink_dump_protoinfo(skb, ct, false) < 0)
goto nla_put_failure;
if (ctnetlink_dump_helpinfo(skb, ct) < 0)
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index b3f4a334f9d7..db7479db8512 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -589,7 +589,7 @@ static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
@@ -597,15 +597,22 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP);
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) ||
- nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
+ if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
cpu_to_be64(ct->proto.dccp.handshake_seq),
CTA_PROTOINFO_DCCP_PAD))
goto nla_put_failure;
+skip_state:
nla_nest_end(skb, nest_parms);
spin_unlock_bh(&ct->lock);
+
return 0;
nla_put_failure:
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 810cca24b399..fb8dc02e502f 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -543,7 +543,7 @@ static bool sctp_can_early_drop(const struct nf_conn *ct)
#include <linux/netfilter/nfnetlink_conntrack.h>
static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
@@ -552,15 +552,20 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) ||
- nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
+ if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) ||
nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY,
ct->proto.sctp.vtag[IP_CT_DIR_REPLY]))
goto nla_put_failure;
+skip_state:
spin_unlock_bh(&ct->lock);
-
nla_nest_end(skb, nest_parms);
return 0;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index e8c86ee4c1c4..1d7e1c595546 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -541,13 +541,20 @@ static bool tcp_in_window(const struct nf_conn *ct,
swin = win << sender->td_scale;
sender->td_maxwin = (swin == 0 ? 1 : swin);
sender->td_maxend = end + sender->td_maxwin;
- /*
- * We haven't seen traffic in the other direction yet
- * but we have to tweak window tracking to pass III
- * and IV until that happens.
- */
- if (receiver->td_maxwin == 0)
+ if (receiver->td_maxwin == 0) {
+ /* We haven't seen traffic in the other
+ * direction yet but we have to tweak window
+ * tracking to pass III and IV until that
+ * happens.
+ */
receiver->td_end = receiver->td_maxend = sack;
+ } else if (sack == receiver->td_end + 1) {
+ /* Likely a reply to a keepalive.
+ * Needed for III.
+ */
+ receiver->td_end++;
+ }
+
}
} else if (((state->state == TCP_CONNTRACK_SYN_SENT
&& dir == IP_CT_DIR_ORIGINAL)
@@ -827,12 +834,6 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
return true;
}
-static bool nf_conntrack_tcp_established(const struct nf_conn *ct)
-{
- return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED &&
- test_bit(IPS_ASSURED_BIT, &ct->status);
-}
-
/* Returns verdict for packet, or -1 for invalid. */
int nf_conntrack_tcp_packet(struct nf_conn *ct,
struct sk_buff *skb,
@@ -1185,7 +1186,7 @@ static bool tcp_can_early_drop(const struct nf_conn *ct)
#include <linux/netfilter/nfnetlink_conntrack.h>
static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
struct nf_ct_tcp_flags tmp = {};
@@ -1195,8 +1196,13 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) ||
- nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
ct->proto.tcp.seen[0].td_scale) ||
nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
ct->proto.tcp.seen[1].td_scale))
@@ -1211,8 +1217,8 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
sizeof(struct nf_ct_tcp_flags), &tmp))
goto nla_put_failure;
+skip_state:
spin_unlock_bh(&ct->lock);
-
nla_nest_end(skb, nest_parms);
return 0;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index a604f43e3e6b..0ee702d374b0 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -428,18 +428,18 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
const struct ip_conntrack_stat *st = v;
if (v == SEQ_START_TOKEN) {
- seq_puts(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
+ seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
return 0;
}
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
nr_conntracks,
- 0,
+ st->clash_resolve,
st->found,
0,
st->invalid,
- st->ignore,
+ 0,
0,
0,
st->insert,
@@ -523,6 +523,9 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
{
int ret;
+ /* module_param hashsize could have changed value */
+ nf_conntrack_htable_size_user = nf_conntrack_htable_size;
+
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret < 0 || !write)
return ret;
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 2b01a151eaa8..a579e59ee5c5 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -19,6 +19,7 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
skb_push(skb, skb->mac_len);
skb->dev = dev;
+ skb->tstamp = 0;
dev_queue_xmit(skb);
}
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 4f7a567c536e..4a4acbba78ff 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -395,12 +395,11 @@ static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
return -1;
tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
+ inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
return 0;
}
@@ -410,14 +409,13 @@ static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
return -1;
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace2(&udph->check, skb, port,
- new_port, true);
+ new_port, false);
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
@@ -449,8 +447,7 @@ int nf_flow_snat_port(const struct flow_offload *flow,
struct flow_ports *hdr;
__be16 port, new_port;
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
+ if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
return -1;
hdr = (void *)(skb_network_header(skb) + thoff);
@@ -481,8 +478,7 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
struct flow_ports *hdr;
__be16 port, new_port;
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
+ if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
return -1;
hdr = (void *)(skb_network_header(skb) + thoff);
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index a3bca758b849..a698dbe28ef5 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -25,9 +25,6 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
if (proto != IPPROTO_TCP)
return 0;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
if (unlikely(tcph->fin || tcph->rst)) {
flow_offload_teardown(flow);
@@ -42,8 +39,7 @@ static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
return -1;
tcph = (void *)(skb_network_header(skb) + thoff);
@@ -57,8 +53,7 @@ static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
return -1;
udph = (void *)(skb_network_header(skb) + thoff);
@@ -167,8 +162,8 @@ static bool ip_has_options(unsigned int thoff)
static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
struct flow_offload_tuple *tuple)
{
+ unsigned int thoff, hdrsize;
struct flow_ports *ports;
- unsigned int thoff;
struct iphdr *iph;
if (!pskb_may_pull(skb, sizeof(*iph)))
@@ -181,15 +176,22 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
unlikely(ip_has_options(thoff)))
return -1;
- if (iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ hdrsize = sizeof(struct udphdr);
+ break;
+ default:
return -1;
+ }
if (iph->ttl <= 1)
return -1;
thoff = iph->ihl * 4;
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ if (!pskb_may_pull(skb, thoff + hdrsize))
return -1;
iph = ip_hdr(skb);
@@ -315,8 +317,7 @@ static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
return -1;
tcph = (void *)(skb_network_header(skb) + thoff);
@@ -332,8 +333,7 @@ static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
return -1;
udph = (void *)(skb_network_header(skb) + thoff);
@@ -439,24 +439,31 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow,
static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
struct flow_offload_tuple *tuple)
{
+ unsigned int thoff, hdrsize;
struct flow_ports *ports;
struct ipv6hdr *ip6h;
- unsigned int thoff;
if (!pskb_may_pull(skb, sizeof(*ip6h)))
return -1;
ip6h = ipv6_hdr(skb);
- if (ip6h->nexthdr != IPPROTO_TCP &&
- ip6h->nexthdr != IPPROTO_UDP)
+ switch (ip6h->nexthdr) {
+ case IPPROTO_TCP:
+ hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ hdrsize = sizeof(struct udphdr);
+ break;
+ default:
return -1;
+ }
if (ip6h->hop_limit <= 1)
return -1;
thoff = sizeof(*ip6h);
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ if (!pskb_may_pull(skb, thoff + hdrsize))
return -1;
ip6h = ipv6_hdr(skb);
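
A small sketch of the tuple-extraction pattern both the IPv4 and IPv6 paths move to: pick the transport header size per protocol first, then do a single linear-data check that covers the full TCP/UDP header rather than only the port fields. transport_hdr_size() below is illustrative, not a kernel function.

#include <stdint.h>
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>

static int transport_hdr_size(uint8_t protocol, size_t *hdrsize)
{
        switch (protocol) {
        case IPPROTO_TCP:
                *hdrsize = sizeof(struct tcphdr);
                return 0;
        case IPPROTO_UDP:
                *hdrsize = sizeof(struct udphdr);
                return 0;
        default:
                return -1;      /* not a flowtable candidate */
        }
}

int main(void)
{
        size_t hdrsize;

        if (transport_hdr_size(IPPROTO_TCP, &hdrsize) == 0)
                printf("pull thoff + %zu bytes\n", hdrsize);
        return 0;
}
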
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index ae5628ddbe6d..fd7c5f0f5c25 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -171,6 +171,18 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
}
EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
+void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb)
+{
+ u16 vid;
+
+ if (!skb_vlan_tag_present(skb))
+ return;
+
+ vid = skb_vlan_tag_get(skb);
+ nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid);
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_vlan);
+
/* bridge and netdev logging families share this code. */
void nf_log_l2packet(struct net *net, u_int8_t pf,
__be16 protocol,
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index ea923f8cf9c4..b7c3c902290f 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -1174,6 +1174,7 @@ static int __init nf_nat_init(void)
ret = register_pernet_subsys(&nat_net_ops);
if (ret < 0) {
nf_ct_extend_unregister(&nat_extend);
+ kvfree(nf_nat_bysource);
return ret;
}
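
For context, a minimal sketch of the init-time unwind the added kvfree() restores: anything allocated before a failing registration step has to be released on that error path. The names below are invented for the example.

#include <stdio.h>
#include <stdlib.h>

static int fake_register_subsys(void) { return -1; }   /* force failure */

static int module_init_sketch(void)
{
        void *bysource;
        int ret;

        bysource = malloc(4096);        /* stands in for the hash table */
        if (!bysource)
                return -1;

        ret = fake_register_subsys();
        if (ret < 0) {
                free(bysource);         /* the leak the patch plugs */
                return ret;
        }
        return 0;
}

int main(void)
{
        printf("init: %d\n", module_init_sketch());
        return 0;
}
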
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 59151dc07fdc..e87b6bd6b3cd 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -715,7 +715,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -953,7 +953,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
&ct->tuplehash[!dir].tuple.src.u3)) {
- err = nf_ip6_route_me_harder(state->net, skb);
+ err = nf_ip6_route_me_harder(state->net, state->sk, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 9cca35d22927..b100c04a0e43 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -446,7 +446,7 @@ synproxy_send_tcp(struct net *net,
skb_dst_set_noref(nskb, skb_dst(skb));
nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC))
goto free_nskb;
if (nfct) {
@@ -849,7 +849,7 @@ synproxy_send_tcp_ipv6(struct net *net,
fl6.fl6_sport = nth->source;
fl6.fl6_dport = nth->dest;
security_skb_classify_flow((struct sk_buff *)skb,
- flowi6_to_flowi(&fl6));
+ flowi6_to_flowi_common(&fl6));
err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
if (err) {
goto free_nskb;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4603b667973a..8ee9f40cc0ea 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -206,7 +206,7 @@ static int nf_tables_register_hook(struct net *net,
if (basechain->type->ops_register)
return basechain->type->ops_register(net, ops);
- if (table->family == NFPROTO_NETDEV)
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
return nft_netdev_register_hooks(net, &basechain->hook_list);
return nf_register_net_hook(net, &basechain->ops);
@@ -228,7 +228,7 @@ static void nf_tables_unregister_hook(struct net *net,
if (basechain->type->ops_unregister)
return basechain->type->ops_unregister(net, ops);
- if (table->family == NFPROTO_NETDEV)
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
nft_netdev_unregister_hooks(net, &basechain->hook_list);
else
nf_unregister_net_hook(net, &basechain->ops);
@@ -302,7 +302,7 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx,
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
if (expr->ops->activate)
expr->ops->activate(ctx, expr);
@@ -317,7 +317,7 @@ static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
if (expr->ops->deactivate)
expr->ops->deactivate(ctx, expr, phase);
@@ -581,7 +581,8 @@ struct nft_module_request {
};
#ifdef CONFIG_MODULES
-static int nft_request_module(struct net *net, const char *fmt, ...)
+static __printf(2, 3) int nft_request_module(struct net *net, const char *fmt,
+ ...)
{
char module_name[MODULE_NAME_LEN];
struct nft_module_request *req;
@@ -619,7 +620,8 @@ static int nft_request_module(struct net *net, const char *fmt, ...)
static void lockdep_nfnl_nft_mutex_not_held(void)
{
#ifdef CONFIG_PROVE_LOCKING
- WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ if (debug_locks)
+ WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
#endif
}
@@ -650,6 +652,8 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_TABLE_FLAGS] = { .type = NLA_U32 },
[NFTA_TABLE_HANDLE] = { .type = NLA_U64 },
+ [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN }
};
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
@@ -676,6 +680,11 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
NFTA_TABLE_PAD))
goto nla_put_failure;
+ if (table->udata) {
+ if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata))
+ goto nla_put_failure;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -988,8 +997,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
int family = nfmsg->nfgen_family;
const struct nlattr *attr;
struct nft_table *table;
- u32 flags = 0;
struct nft_ctx ctx;
+ u32 flags = 0;
int err;
lockdep_assert_held(&net->nft.commit_mutex);
@@ -1025,6 +1034,14 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
if (table->name == NULL)
goto err_strdup;
+ if (nla[NFTA_TABLE_USERDATA]) {
+ table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL);
+ if (table->udata == NULL)
+ goto err_table_udata;
+
+ table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
+ }
+
err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
if (err)
goto err_chain_ht;
@@ -1047,6 +1064,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
err_trans:
rhltable_destroy(&table->chains_ht);
err_chain_ht:
+ kfree(table->udata);
+err_table_udata:
kfree(table->name);
err_strdup:
kfree(table);
@@ -1202,6 +1221,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
rhltable_destroy(&ctx->table->chains_ht);
kfree(ctx->table->name);
+ kfree(ctx->table->udata);
kfree(ctx->table);
}
@@ -1263,7 +1283,7 @@ static struct nft_chain *nft_chain_lookup(struct net *net,
if (nla == NULL)
return ERR_PTR(-EINVAL);
- nla_strlcpy(search, nla, sizeof(search));
+ nla_strscpy(search, nla, sizeof(search));
WARN_ON(!rcu_read_lock_held() &&
!lockdep_commit_lock_is_held(net));
@@ -1297,6 +1317,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
[NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED },
[NFTA_CHAIN_FLAGS] = { .type = NLA_U32 },
[NFTA_CHAIN_ID] = { .type = NLA_U32 },
+ [NFTA_CHAIN_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -1361,7 +1383,7 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
goto nla_put_failure;
- if (family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(family, ops->hooknum)) {
nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
list_for_each_entry(hook, &basechain->hook_list, list) {
if (!first)
@@ -1438,6 +1460,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
goto nla_put_failure;
+ if (chain->udata &&
+ nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1661,7 +1687,7 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx)
if (nft_is_base_chain(chain)) {
struct nft_base_chain *basechain = nft_base_chain(chain);
- if (ctx->family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
list_for_each_entry_safe(hook, next,
&basechain->hook_list, list) {
list_del_rcu(&hook->list);
@@ -1674,9 +1700,11 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx)
free_percpu(rcu_dereference_raw(basechain->stats));
}
kfree(chain->name);
+ kfree(chain->udata);
kfree(basechain);
} else {
kfree(chain->name);
+ kfree(chain->udata);
kfree(chain);
}
}
@@ -1695,7 +1723,11 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
goto err_hook_alloc;
}
- nla_strlcpy(ifname, attr, IFNAMSIZ);
+ nla_strscpy(ifname, attr, IFNAMSIZ);
+ /* nf_tables_netdev_event() is called under rtnl_mutex, this is
+ * indirectly serializing all the other holders of the commit_mutex with
+ * the rtnl_mutex.
+ */
dev = __dev_get_by_name(net, ifname);
if (!dev) {
err = -ENOENT;
@@ -1838,7 +1870,7 @@ static int nft_chain_parse_hook(struct net *net,
if (IS_ERR(type))
return PTR_ERR(type);
}
- if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
+ if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
return -EOPNOTSUPP;
if (type->type == NFT_CHAIN_T_NAT &&
@@ -1851,7 +1883,7 @@ static int nft_chain_parse_hook(struct net *net,
hook->type = type;
INIT_LIST_HEAD(&hook->list);
- if (family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(family, hook->num)) {
err = nft_chain_parse_netdev(net, ha, &hook->list);
if (err < 0) {
module_put(type->owner);
@@ -1918,7 +1950,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
INIT_LIST_HEAD(&basechain->hook_list);
chain = &basechain->chain;
- if (family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(family, hook->num)) {
list_splice_init(&hook->list, &basechain->hook_list);
list_for_each_entry(h, &basechain->hook_list, list)
nft_basechain_hook_init(&h->ops, family, hook, chain);
@@ -2030,7 +2062,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
} else {
if (!(flags & NFT_CHAIN_BINDING)) {
err = -EINVAL;
- goto err1;
+ goto err_destroy_chain;
}
snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
@@ -2039,13 +2071,22 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
if (!chain->name) {
err = -ENOMEM;
- goto err1;
+ goto err_destroy_chain;
+ }
+
+ if (nla[NFTA_CHAIN_USERDATA]) {
+ chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL);
+ if (chain->udata == NULL) {
+ err = -ENOMEM;
+ goto err_destroy_chain;
+ }
+ chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
}
rules = nf_tables_chain_alloc_rules(chain, 0);
if (!rules) {
err = -ENOMEM;
- goto err1;
+ goto err_destroy_chain;
}
*rules = NULL;
@@ -2054,12 +2095,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
err = nf_tables_register_hook(net, table, chain);
if (err < 0)
- goto err1;
+ goto err_destroy_chain;
trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- goto err2;
+ goto err_unregister_hook;
}
nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
@@ -2069,15 +2110,15 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
err = nft_chain_add(table, chain);
if (err < 0) {
nft_trans_destroy(trans);
- goto err2;
+ goto err_unregister_hook;
}
table->use++;
return 0;
-err2:
+err_unregister_hook:
nf_tables_unregister_hook(net, table, chain);
-err1:
+err_destroy_chain:
nf_tables_chain_destroy(ctx);
return err;
@@ -2103,7 +2144,8 @@ static bool nft_hook_list_equal(struct list_head *hook_list1,
}
static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
- u32 flags)
+ u32 flags, const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -2119,9 +2161,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
return -EOPNOTSUPP;
if (nla[NFTA_CHAIN_HOOK]) {
- if (!nft_is_base_chain(chain))
+ if (!nft_is_base_chain(chain)) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
-
+ }
err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
false);
if (err < 0)
@@ -2130,13 +2173,15 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
basechain = nft_base_chain(chain);
if (basechain->type != hook.type) {
nft_chain_release_hook(&hook);
+ NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
- if (ctx->family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(ctx->family, hook.num)) {
if (!nft_hook_list_equal(&basechain->hook_list,
&hook.list)) {
nft_chain_release_hook(&hook);
+ NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
} else {
@@ -2144,6 +2189,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
if (ops->hooknum != hook.num ||
ops->priority != hook.priority) {
nft_chain_release_hook(&hook);
+ NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
}
@@ -2156,8 +2202,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
chain2 = nft_chain_lookup(ctx->net, table,
nla[NFTA_CHAIN_NAME], genmask);
- if (!IS_ERR(chain2))
+ if (!IS_ERR(chain2)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
return -EEXIST;
+ }
}
if (nla[NFTA_CHAIN_COUNTERS]) {
@@ -2200,6 +2248,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
nft_trans_chain_update(tmp) &&
nft_trans_chain_name(tmp) &&
strcmp(name, nft_trans_chain_name(tmp)) == 0) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
kfree(name);
goto err;
}
@@ -2322,7 +2371,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
return -EOPNOTSUPP;
flags |= chain->flags & NFT_CHAIN_BASE;
- return nf_tables_updchain(&ctx, genmask, policy, flags);
+ return nf_tables_updchain(&ctx, genmask, policy, flags, attr,
+ extack);
}
return nf_tables_addchain(&ctx, family, genmask, policy, flags);
@@ -3036,7 +3086,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
* is called on error from nf_tables_newrule().
*/
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
next = nft_expr_next(expr);
nf_tables_expr_destroy(ctx, expr);
expr = next;
@@ -3521,6 +3571,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_SET_HANDLE] = { .type = NLA_U64 },
[NFTA_SET_EXPR] = { .type = NLA_NESTED },
+ [NFTA_SET_EXPRESSIONS] = { .type = NLA_NESTED },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -3674,7 +3725,7 @@ cont:
return 0;
}
-static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
+int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
{
u64 ms = be64_to_cpu(nla_get_be64(nla));
u64 max = (u64)(~((u64)0));
@@ -3688,7 +3739,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
return 0;
}
-static __be64 nf_jiffies64_to_msecs(u64 input)
+__be64 nf_jiffies64_to_msecs(u64 input)
{
return cpu_to_be64(jiffies64_to_msecs(input));
}
@@ -3728,6 +3779,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
u32 portid = ctx->portid;
struct nlattr *nest;
u32 seq = ctx->seq;
+ int i;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
@@ -3796,12 +3848,23 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nla_nest_end(skb, nest);
- if (set->expr) {
+ if (set->num_exprs == 1) {
nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
- if (nf_tables_fill_expr_info(skb, set->expr) < 0)
+ if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
+ } else if (set->num_exprs > 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < set->num_exprs; i++) {
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM,
+ set->exprs[i]) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
}
nlmsg_end(skb, nlh);
@@ -4099,7 +4162,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
NFT_SET_MAP | NFT_SET_EVAL |
- NFT_SET_OBJECT | NFT_SET_CONCAT))
+ NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR))
return -EOPNOTSUPP;
/* Only one of these operations is supported */
if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) ==
@@ -4170,7 +4233,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
return err;
}
- if (nla[NFTA_SET_EXPR])
+ if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS])
desc.expr = true;
table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
@@ -4234,6 +4297,35 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err = PTR_ERR(expr);
goto err_set_alloc_name;
}
+ set->exprs[0] = expr;
+ set->num_exprs++;
+ } else if (nla[NFTA_SET_EXPRESSIONS]) {
+ struct nft_expr *expr;
+ struct nlattr *tmp;
+ int left;
+
+ if (!(flags & NFT_SET_EXPR)) {
+ err = -EINVAL;
+ goto err_set_alloc_name;
+ }
+ i = 0;
+ nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX) {
+ err = -E2BIG;
+ goto err_set_init;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_set_init;
+ }
+ expr = nft_set_elem_expr_alloc(&ctx, set, tmp);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_init;
+ }
+ set->exprs[i++] = expr;
+ set->num_exprs++;
+ }
}
udata = NULL;
@@ -4251,7 +4343,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
set->dtype = dtype;
set->objtype = objtype;
set->dlen = desc.dlen;
- set->expr = expr;
set->flags = flags;
set->size = desc.size;
set->policy = policy;
@@ -4280,8 +4371,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err_set_trans:
ops->destroy(set);
err_set_init:
- if (expr)
- nft_expr_destroy(&ctx, expr);
+ for (i = 0; i < set->num_exprs; i++)
+ nft_expr_destroy(&ctx, set->exprs[i]);
err_set_alloc_name:
kfree(set->name);
err_set_name:
@@ -4291,11 +4382,13 @@ err_set_name:
static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
+ int i;
+
if (WARN_ON(set->use > 0))
return;
- if (set->expr)
- nft_expr_destroy(ctx, set->expr);
+ for (i = 0; i < set->num_exprs; i++)
+ nft_expr_destroy(ctx, set->exprs[i]);
set->ops->destroy(set);
kfree(set->name);
@@ -4448,8 +4541,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_DATA] = {
.align = __alignof__(u32),
},
- [NFT_SET_EXT_EXPR] = {
- .align = __alignof__(struct nft_expr),
+ [NFT_SET_EXT_EXPRESSIONS] = {
+ .align = __alignof__(struct nft_set_elem_expr),
},
[NFT_SET_EXT_OBJREF] = {
.len = sizeof(struct nft_object *),
@@ -4492,6 +4585,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
[NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING,
.len = NFT_OBJ_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_EXPRESSIONS] = { .type = NLA_NESTED },
};
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
@@ -4525,6 +4619,43 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
return 0;
}
+static int nft_set_elem_expr_dump(struct sk_buff *skb,
+ const struct nft_set *set,
+ const struct nft_set_ext *ext)
+{
+ struct nft_set_elem_expr *elem_expr;
+ u32 size, num_exprs = 0;
+ struct nft_expr *expr;
+ struct nlattr *nest;
+
+ elem_expr = nft_set_ext_expr(ext);
+ nft_setelem_expr_foreach(expr, elem_expr, size)
+ num_exprs++;
+
+ if (num_exprs == 1) {
+ expr = nft_setelem_expr_at(elem_expr, 0);
+ if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0)
+ return -1;
+
+ return 0;
+ } else if (num_exprs > 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ nft_setelem_expr_foreach(expr, elem_expr, size) {
+ expr = nft_setelem_expr_at(elem_expr, size);
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
static int nf_tables_fill_setelem(struct sk_buff *skb,
const struct nft_set *set,
const struct nft_set_elem *elem)
@@ -4552,8 +4683,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
set->dlen) < 0)
goto nla_put_failure;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) &&
- nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) &&
+ nft_set_elem_expr_dump(skb, set, ext))
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
@@ -5048,8 +5179,8 @@ void *nft_set_elem_init(const struct nft_set *set,
return elem;
}
-static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
- struct nft_expr *expr)
+static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_expr *expr)
{
if (expr->ops->destroy_clone) {
expr->ops->destroy_clone(ctx, expr);
@@ -5059,6 +5190,16 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
}
}
+static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_set_elem_expr *elem_expr)
+{
+ struct nft_expr *expr;
+ u32 size;
+
+ nft_setelem_expr_foreach(expr, elem_expr, size)
+ __nft_set_elem_expr_destroy(ctx, expr);
+}
+
void nft_set_elem_destroy(const struct nft_set *set, void *elem,
bool destroy_expr)
{
@@ -5071,7 +5212,7 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
- if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext));
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
@@ -5088,32 +5229,72 @@ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
kfree(elem);
}
+int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_expr *expr_array[])
+{
+ struct nft_expr *expr;
+ int err, i, k;
+
+ for (i = 0; i < set->num_exprs; i++) {
+ expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL);
+ if (!expr)
+ goto err_expr;
+
+ err = nft_expr_clone(expr, set->exprs[i]);
+ if (err < 0) {
+ nft_expr_destroy(ctx, expr);
+ goto err_expr;
+ }
+ expr_array[i] = expr;
+ }
+
+ return 0;
+
+err_expr:
+ for (k = i - 1; k >= 0; k--)
+ nft_expr_destroy(ctx, expr_array[k]);
+
+ return -ENOMEM;
+}
+
+static void nft_set_elem_expr_setup(const struct nft_set_ext *ext, int i,
+ struct nft_expr *expr_array[])
+{
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ struct nft_expr *expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
+
+ memcpy(expr, expr_array[i], expr_array[i]->ops->size);
+ elem_expr->size += expr_array[i]->ops->size;
+ kfree(expr_array[i]);
+ expr_array[i] = NULL;
+}
+
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{
+ struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
u8 genmask = nft_genmask_next(ctx->net);
+ u32 flags = 0, size = 0, num_exprs = 0;
struct nft_set_ext_tmpl tmpl;
struct nft_set_ext *ext, *ext2;
struct nft_set_elem elem;
struct nft_set_binding *binding;
struct nft_object *obj = NULL;
- struct nft_expr *expr = NULL;
struct nft_userdata *udata;
struct nft_data_desc desc;
enum nft_registers dreg;
struct nft_trans *trans;
- u32 flags = 0;
u64 timeout;
u64 expiration;
+ int err, i;
u8 ulen;
- int err;
err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
nft_set_elem_policy, NULL);
@@ -5146,7 +5327,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nla[NFTA_SET_ELEM_TIMEOUT] ||
nla[NFTA_SET_ELEM_EXPIRATION] ||
nla[NFTA_SET_ELEM_USERDATA] ||
- nla[NFTA_SET_ELEM_EXPR]))
+ nla[NFTA_SET_ELEM_EXPR] ||
+ nla[NFTA_SET_ELEM_EXPRESSIONS]))
return -EINVAL;
timeout = 0;
@@ -5171,23 +5353,64 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return err;
}
- if (nla[NFTA_SET_ELEM_EXPR] != NULL) {
+ if (nla[NFTA_SET_ELEM_EXPR]) {
+ struct nft_expr *expr;
+
+ if (set->num_exprs && set->num_exprs != 1)
+ return -EOPNOTSUPP;
+
expr = nft_set_elem_expr_alloc(ctx, set,
nla[NFTA_SET_ELEM_EXPR]);
if (IS_ERR(expr))
return PTR_ERR(expr);
- err = -EOPNOTSUPP;
- if (set->expr && set->expr->ops != expr->ops)
+ expr_array[0] = expr;
+ num_exprs = 1;
+
+ if (set->num_exprs && set->exprs[0]->ops != expr->ops) {
+ err = -EOPNOTSUPP;
goto err_set_elem_expr;
- } else if (set->expr) {
- expr = kzalloc(set->expr->ops->size, GFP_KERNEL);
- if (!expr)
- return -ENOMEM;
+ }
+ } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) {
+ struct nft_expr *expr;
+ struct nlattr *tmp;
+ int left;
+
+ i = 0;
+ nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX ||
+ (set->num_exprs && set->num_exprs == i)) {
+ err = -E2BIG;
+ goto err_set_elem_expr;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_set_elem_expr;
+ }
+ expr = nft_set_elem_expr_alloc(ctx, set, tmp);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_elem_expr;
+ }
+ expr_array[i] = expr;
+ num_exprs++;
- err = nft_expr_clone(expr, set->expr);
- if (err < 0)
+ if (set->num_exprs && expr->ops != set->exprs[i]->ops) {
+ err = -EOPNOTSUPP;
+ goto err_set_elem_expr;
+ }
+ i++;
+ }
+ if (set->num_exprs && set->num_exprs != i) {
+ err = -EOPNOTSUPP;
goto err_set_elem_expr;
+ }
+ } else if (set->num_exprs > 0) {
+ err = nft_set_elem_expr_clone(ctx, set, expr_array);
+ if (err < 0)
+ goto err_set_elem_expr_clone;
+
+ num_exprs = set->num_exprs;
}
err = nft_setelem_parse_key(ctx, set, &elem.key.val,
@@ -5212,9 +5435,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
}
- if (expr)
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPR,
- expr->ops->size);
+ if (num_exprs) {
+ for (i = 0; i < num_exprs; i++)
+ size += expr_array[i]->ops->size;
+
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
+ sizeof(struct nft_set_elem_expr) +
+ size);
+ }
if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
if (!(set->flags & NFT_SET_OBJECT)) {
@@ -5296,11 +5524,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
*nft_set_ext_obj(ext) = obj;
obj->use++;
}
- if (expr) {
- memcpy(nft_set_ext_expr(ext), expr, expr->ops->size);
- kfree(expr);
- expr = NULL;
- }
+ for (i = 0; i < num_exprs; i++)
+ nft_set_elem_expr_setup(ext, i, expr_array);
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
if (trans == NULL)
@@ -5361,9 +5586,9 @@ err_parse_key_end:
err_parse_key:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
err_set_elem_expr:
- if (expr != NULL)
- nft_expr_destroy(ctx, expr);
-
+ for (i = 0; i < num_exprs && expr_array[i]; i++)
+ nft_expr_destroy(ctx, expr_array[i]);
+err_set_elem_expr_clone:
return err;
}
@@ -5690,7 +5915,7 @@ struct nft_object *nft_obj_lookup(const struct net *net,
struct rhlist_head *tmp, *list;
struct nft_object *obj;
- nla_strlcpy(search, nla, sizeof(search));
+ nla_strscpy(search, nla, sizeof(search));
k.name = search;
WARN_ON_ONCE(!rcu_read_lock_held() &&
@@ -5737,6 +5962,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
[NFTA_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_OBJ_DATA] = { .type = NLA_NESTED },
[NFTA_OBJ_HANDLE] = { .type = NLA_U64},
+ [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
@@ -5928,7 +6155,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
- goto err1;
+ goto err_init;
}
obj->key.table = table;
obj->handle = nf_tables_alloc_handle(table);
@@ -5936,32 +6163,42 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
if (!obj->key.name) {
err = -ENOMEM;
- goto err2;
+ goto err_strdup;
+ }
+
+ if (nla[NFTA_OBJ_USERDATA]) {
+ obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL);
+ if (obj->udata == NULL)
+ goto err_userdata;
+
+ obj->udlen = nla_len(nla[NFTA_OBJ_USERDATA]);
}
err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
if (err < 0)
- goto err3;
+ goto err_trans;
err = rhltable_insert(&nft_objname_ht, &obj->rhlhead,
nft_objname_ht_params);
if (err < 0)
- goto err4;
+ goto err_obj_ht;
list_add_tail_rcu(&obj->list, &table->objects);
table->use++;
return 0;
-err4:
+err_obj_ht:
/* queued in transaction log */
INIT_LIST_HEAD(&obj->list);
return err;
-err3:
+err_trans:
kfree(obj->key.name);
-err2:
+err_userdata:
+ kfree(obj->udata);
+err_strdup:
if (obj->ops->destroy)
obj->ops->destroy(&ctx, obj);
kfree(obj);
-err1:
+err_init:
module_put(type->owner);
return err;
}
@@ -5993,6 +6230,10 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
NFTA_OBJ_PAD))
goto nla_put_failure;
+ if (obj->udata &&
+ nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -6199,6 +6440,7 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
module_put(obj->ops->type->owner);
kfree(obj->key.name);
+ kfree(obj->udata);
kfree(obj);
}
@@ -7076,7 +7318,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
GFP_KERNEL);
kfree(buf);
- if (ctx->report &&
+ if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
return;
@@ -7198,7 +7440,7 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
audit_log_nfcfg("?:0;?:0", 0, net->nft.base_seq,
AUDIT_NFT_OP_GEN_REGISTER, GFP_KERNEL);
- if (nlmsg_report(nlh) &&
+ if (!nlmsg_report(nlh) &&
!nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
return;
@@ -7992,12 +8234,16 @@ static void nf_tables_abort_release(struct nft_trans *trans)
kfree(trans);
}
-static int __nf_tables_abort(struct net *net, bool autoload)
+static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
{
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
struct nft_hook *hook;
+ if (action == NFNL_ABORT_VALIDATE &&
+ nf_tables_validate(net) < 0)
+ return -EAGAIN;
+
list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list,
list) {
switch (trans->msg_type) {
@@ -8129,7 +8375,7 @@ static int __nf_tables_abort(struct net *net, bool autoload)
nf_tables_abort_release(trans);
}
- if (autoload)
+ if (action == NFNL_ABORT_AUTOLOAD)
nf_tables_module_autoload(net);
else
nf_tables_module_autoload_cleanup(net);
@@ -8142,9 +8388,10 @@ static void nf_tables_cleanup(struct net *net)
nft_validate_state_update(net, NFT_VALIDATE_SKIP);
}
-static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload)
+static int nf_tables_abort(struct net *net, struct sk_buff *skb,
+ enum nfnl_abort_action action)
{
- int ret = __nf_tables_abort(net, autoload);
+ int ret = __nf_tables_abort(net, action);
mutex_unlock(&net->nft.commit_mutex);
@@ -8704,6 +8951,17 @@ int __nft_release_basechain(struct nft_ctx *ctx)
}
EXPORT_SYMBOL_GPL(__nft_release_basechain);
+static void __nft_release_hooks(struct net *net)
+{
+ struct nft_table *table;
+ struct nft_chain *chain;
+
+ list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(chain, &table->chains, list)
+ nf_tables_unregister_hook(net, table, chain);
+ }
+}
+
static void __nft_release_tables(struct net *net)
{
struct nft_flowtable *flowtable, *nf;
@@ -8719,10 +8977,6 @@ static void __nft_release_tables(struct net *net)
list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
ctx.family = table->family;
-
- list_for_each_entry(chain, &table->chains, list)
- nf_tables_unregister_hook(net, table, chain);
- /* No packets are walking on these chains anymore. */
ctx.table = table;
list_for_each_entry(chain, &table->chains, list) {
ctx.chain = chain;
@@ -8771,11 +9025,16 @@ static int __net_init nf_tables_init_net(struct net *net)
return 0;
}
+static void __net_exit nf_tables_pre_exit_net(struct net *net)
+{
+ __nft_release_hooks(net);
+}
+
static void __net_exit nf_tables_exit_net(struct net *net)
{
mutex_lock(&net->nft.commit_mutex);
if (!list_empty(&net->nft.commit_list))
- __nf_tables_abort(net, false);
+ __nf_tables_abort(net, NFNL_ABORT_NONE);
__nft_release_tables(net);
mutex_unlock(&net->nft.commit_mutex);
WARN_ON_ONCE(!list_empty(&net->nft.tables));
@@ -8784,8 +9043,9 @@ static void __net_exit nf_tables_exit_net(struct net *net)
}
static struct pernet_operations nf_tables_net_ops = {
- .init = nf_tables_init_net,
- .exit = nf_tables_exit_net,
+ .init = nf_tables_init_net,
+ .pre_exit = nf_tables_pre_exit_net,
+ .exit = nf_tables_exit_net,
};
static int __init nf_tables_module_init(void)
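
Much of this change stores several element expressions back-to-back in one variable-length area and walks them by accumulating each entry's size (nft_setelem_expr_foreach and nft_set_elem_expr_setup above). A self-contained sketch of that packing pattern, with made-up struct names:

#include <stdint.h>
#include <stdio.h>

struct entry {
        uint32_t size;          /* total size of this entry, header included */
        char name[16];
};

struct blob {
        uint32_t size;          /* bytes used in data[] */
        uint8_t data[256];
};

static void blob_append(struct blob *b, const char *name)
{
        struct entry *e = (struct entry *)(b->data + b->size);

        e->size = sizeof(*e);
        snprintf(e->name, sizeof(e->name), "%s", name);
        b->size += e->size;
}

int main(void)
{
        struct blob b = { .size = 0 };
        uint32_t off;

        blob_append(&b, "counter");
        blob_append(&b, "limit");

        /* walk the packed entries by offset, as the foreach macro does */
        for (off = 0; off < b.size;
             off += ((struct entry *)(b.data + off))->size)
                printf("expr: %s\n", ((struct entry *)(b.data + off))->name);

        return 0;
}
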
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 587897a2498b..dbc2e945c98e 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -47,13 +47,22 @@ static inline void nft_trace_packet(struct nft_traceinfo *info,
}
}
+static void nft_bitwise_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ u32 *src = &regs->data[priv->sreg];
+ u32 *dst = &regs->data[priv->dreg];
+
+ *dst = (*src & priv->mask) ^ priv->xor;
+}
+
static void nft_cmp_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
- u32 mask = nft_cmp_fast_mask(priv->len);
- if ((regs->data[priv->sreg] & mask) == priv->data)
+ if (((regs->data[priv->sreg] & priv->mask) == priv->data) ^ priv->inv)
return;
regs->verdict.code = NFT_BREAK;
}
@@ -176,6 +185,8 @@ next_rule:
nft_rule_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
+ else if (expr->ops == &nft_bitwise_fast_ops)
+ nft_bitwise_fast_eval(expr, &regs);
else if (expr->ops != &nft_payload_fast_ops ||
!nft_payload_fast_eval(expr, &regs, pkt))
expr_call_ops_eval(expr, &regs, pkt);
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 9ef37c1b7b3b..9ae14270c543 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -28,6 +28,23 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
return flow;
}
+void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow,
+ enum flow_dissector_key_id addr_type)
+{
+ struct nft_flow_match *match = &flow->match;
+ struct nft_flow_key *mask = &match->mask;
+ struct nft_flow_key *key = &match->key;
+
+ if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL))
+ return;
+
+ key->control.addr_type = addr_type;
+ mask->control.addr_type = 0xffff;
+ match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] =
+ offsetof(struct nft_flow_key, control);
+}
+
struct nft_flow_rule *nft_flow_rule_create(struct net *net,
const struct nft_rule *rule)
{
@@ -37,7 +54,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net,
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr->ops && expr != nft_expr_last(rule)) {
+ while (nft_expr_more(rule, expr)) {
if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION)
num_actions++;
@@ -61,7 +78,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net,
ctx->net = net;
ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;
- while (expr->ops && expr != nft_expr_last(rule)) {
+ while (nft_expr_more(rule, expr)) {
if (!expr->ops->offload) {
err = -EOPNOTSUPP;
goto err_out;
@@ -323,8 +340,6 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain,
return nft_block_setup(basechain, &bo, cmd);
}
-#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK
-
static int nft_chain_offload_cmd(struct nft_base_chain *basechain,
struct net_device *dev,
enum flow_block_command cmd)
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 3a2e64e13b22..d3df66a39b5e 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -46,6 +46,23 @@ static struct {
const struct nfnetlink_subsystem __rcu *subsys;
} table[NFNL_SUBSYS_COUNT];
+static struct lock_class_key nfnl_lockdep_keys[NFNL_SUBSYS_COUNT];
+
+static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = {
+ [NFNL_SUBSYS_NONE] = "nfnl_subsys_none",
+ [NFNL_SUBSYS_CTNETLINK] = "nfnl_subsys_ctnetlink",
+ [NFNL_SUBSYS_CTNETLINK_EXP] = "nfnl_subsys_ctnetlink_exp",
+ [NFNL_SUBSYS_QUEUE] = "nfnl_subsys_queue",
+ [NFNL_SUBSYS_ULOG] = "nfnl_subsys_ulog",
+ [NFNL_SUBSYS_OSF] = "nfnl_subsys_osf",
+ [NFNL_SUBSYS_IPSET] = "nfnl_subsys_ipset",
+ [NFNL_SUBSYS_ACCT] = "nfnl_subsys_acct",
+ [NFNL_SUBSYS_CTNETLINK_TIMEOUT] = "nfnl_subsys_cttimeout",
+ [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper",
+ [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables",
+ [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat",
+};
+
static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK,
[NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK,
@@ -316,7 +333,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
return netlink_ack(skb, nlh, -EINVAL, NULL);
replay:
status = 0;
-
+replay_abort:
skb = netlink_skb_clone(oskb, GFP_KERNEL);
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM, NULL);
@@ -482,7 +499,7 @@ ack:
}
done:
if (status & NFNL_BATCH_REPLAY) {
- ss->abort(net, oskb, true);
+ ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD);
nfnl_err_reset(&err_list);
kfree_skb(skb);
module_put(ss->owner);
@@ -493,11 +510,25 @@ done:
status |= NFNL_BATCH_REPLAY;
goto done;
} else if (err) {
- ss->abort(net, oskb, false);
+ ss->abort(net, oskb, NFNL_ABORT_NONE);
netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
}
} else {
- ss->abort(net, oskb, false);
+ enum nfnl_abort_action abort_action;
+
+ if (status & NFNL_BATCH_FAILURE)
+ abort_action = NFNL_ABORT_NONE;
+ else
+ abort_action = NFNL_ABORT_VALIDATE;
+
+ err = ss->abort(net, oskb, abort_action);
+ if (err == -EAGAIN) {
+ nfnl_err_reset(&err_list);
+ kfree_skb(skb);
+ module_put(ss->owner);
+ status |= NFNL_BATCH_FAILURE;
+ goto replay_abort;
+ }
}
if (ss->cleanup)
ss->cleanup(net);
@@ -632,7 +663,7 @@ static int __init nfnetlink_init(void)
BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
for (i=0; i<NFNL_SUBSYS_COUNT; i++)
- mutex_init(&table[i].mutex);
+ __mutex_init(&table[i].mutex, nfnl_lockdep_names[i], &nfnl_lockdep_keys[i]);
return register_pernet_subsys(&nfnetlink_net_ops);
}
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 5bfec829c12f..0fa1653b5f19 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -16,6 +16,7 @@
#include <linux/errno.h>
#include <net/netlink.h>
#include <net/sock.h>
+#include <net/netns/generic.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
@@ -41,6 +42,17 @@ struct nfacct_filter {
u32 mask;
};
+struct nfnl_acct_net {
+ struct list_head nfnl_acct_list;
+};
+
+static unsigned int nfnl_acct_net_id __read_mostly;
+
+static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_acct_net_id);
+}
+
#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES)
#define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */
@@ -49,6 +61,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
const struct nlattr * const tb[],
struct netlink_ext_ack *extack)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *nfacct, *matching = NULL;
char *acct_name;
unsigned int size = 0;
@@ -61,7 +74,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
if (strlen(acct_name) == 0)
return -EINVAL;
- list_for_each_entry(nfacct, &net->nfnl_acct_list, head) {
+ list_for_each_entry(nfacct, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -112,7 +125,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
nfacct->flags = flags;
}
- nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
+ nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
if (tb[NFACCT_BYTES]) {
atomic64_set(&nfacct->bytes,
@@ -123,7 +136,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
}
refcount_set(&nfacct->refcnt, 1);
- list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list);
+ list_add_tail_rcu(&nfacct->head, &nfnl_acct_net->nfnl_acct_list);
return 0;
}
@@ -188,6 +201,7 @@ static int
nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *last;
const struct nfacct_filter *filter = cb->data;
@@ -199,7 +213,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (last) {
if (cur != last)
continue;
@@ -269,6 +283,7 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
const struct nlattr * const tb[],
struct netlink_ext_ack *extack)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
int ret = -ENOENT;
struct nf_acct *cur;
char *acct_name;
@@ -288,7 +303,7 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
return -EINVAL;
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
@@ -342,19 +357,20 @@ static int nfnl_acct_del(struct net *net, struct sock *nfnl,
const struct nlattr * const tb[],
struct netlink_ext_ack *extack)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *tmp;
int ret = -ENOENT;
char *acct_name;
if (!tb[NFACCT_NAME]) {
- list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head)
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head)
nfnl_acct_try_del(cur);
return 0;
}
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -402,10 +418,11 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *acct = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
continue;
@@ -488,16 +505,17 @@ EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
static int __net_init nfnl_acct_net_init(struct net *net)
{
- INIT_LIST_HEAD(&net->nfnl_acct_list);
+ INIT_LIST_HEAD(&nfnl_acct_pernet(net)->nfnl_acct_list);
return 0;
}
static void __net_exit nfnl_acct_net_exit(struct net *net)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *tmp;
- list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) {
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) {
list_del_rcu(&cur->head);
if (refcount_dec_and_test(&cur->refcnt))
@@ -508,6 +526,8 @@ static void __net_exit nfnl_acct_net_exit(struct net *net)
static struct pernet_operations nfnl_acct_ops = {
.init = nfnl_acct_net_init,
.exit = nfnl_acct_net_exit,
+ .id = &nfnl_acct_net_id,
+ .size = sizeof(struct nfnl_acct_net),
};
static int __init nfnl_acct_init(void)
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 5b0d0a77379c..0f94fce1d3ed 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -146,7 +146,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
!tb[NFCTH_POLICY_EXPECT_TIMEOUT])
return -EINVAL;
- nla_strlcpy(expect_policy->name,
+ nla_strscpy(expect_policy->name,
tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN);
expect_policy->max_expected =
ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
@@ -233,7 +233,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
if (ret < 0)
goto err1;
- nla_strlcpy(helper->name,
+ nla_strscpy(helper->name,
tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN);
size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
if (size > sizeof_field(struct nf_conn_help, data)) {
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index bc37d6c59db4..bbd773d74377 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -163,11 +163,6 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
u32 len;
int err;
- if (!tb[NFTA_BITWISE_SREG] ||
- !tb[NFTA_BITWISE_DREG] ||
- !tb[NFTA_BITWISE_LEN])
- return -EINVAL;
-
err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
if (err < 0)
return err;
@@ -292,9 +287,143 @@ static const struct nft_expr_ops nft_bitwise_ops = {
.offload = nft_bitwise_offload,
};
+static int
+nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out)
+{
+ struct nft_data_desc desc;
+ struct nft_data data;
+ int err = 0;
+
+ err = nft_data_init(NULL, &data, sizeof(data), &desc, tb);
+ if (err < 0)
+ return err;
+
+ if (desc.type != NFT_DATA_VALUE || desc.len != sizeof(u32)) {
+ err = -EINVAL;
+ goto err;
+ }
+ *out = data.data[0];
+err:
+ nft_data_release(&data, desc.type);
+ return err;
+}
+
+static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ int err;
+
+ priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
+ err = nft_validate_register_load(priv->sreg, sizeof(u32));
+ if (err < 0)
+ return err;
+
+ priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]);
+ err = nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, sizeof(u32));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_BITWISE_DATA])
+ return -EINVAL;
+
+ if (!tb[NFTA_BITWISE_MASK] ||
+ !tb[NFTA_BITWISE_XOR])
+ return -EINVAL;
+
+ err = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_MASK], &priv->mask);
+ if (err < 0)
+ return err;
+
+ err = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_XOR], &priv->xor);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int
+nft_bitwise_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data data;
+
+ if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg))
+ return -1;
+ if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg))
+ return -1;
+ if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32))))
+ return -1;
+ if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_BOOL)))
+ return -1;
+
+ data.data[0] = priv->mask;
+ if (nft_data_dump(skb, NFTA_BITWISE_MASK, &data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+
+ data.data[0] = priv->xor;
+ if (nft_data_dump(skb, NFTA_BITWISE_XOR, &data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_bitwise_fast_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ if (priv->xor || priv->sreg != priv->dreg || reg->len != sizeof(u32))
+ return -EOPNOTSUPP;
+
+ reg->mask.data[0] = priv->mask;
+ return 0;
+}
+
+const struct nft_expr_ops nft_bitwise_fast_ops = {
+ .type = &nft_bitwise_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_bitwise_fast_init,
+ .dump = nft_bitwise_fast_dump,
+ .offload = nft_bitwise_fast_offload,
+};
+
+static const struct nft_expr_ops *
+nft_bitwise_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ int err;
+ u32 len;
+
+ if (!tb[NFTA_BITWISE_LEN] ||
+ !tb[NFTA_BITWISE_SREG] ||
+ !tb[NFTA_BITWISE_DREG])
+ return ERR_PTR(-EINVAL);
+
+ err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (len != sizeof(u32))
+ return &nft_bitwise_ops;
+
+ if (tb[NFTA_BITWISE_OP] &&
+ ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_BOOL)
+ return &nft_bitwise_ops;
+
+ return &nft_bitwise_fast_ops;
+}
+
struct nft_expr_type nft_bitwise_type __read_mostly = {
.name = "bitwise",
- .ops = &nft_bitwise_ops,
+ .select_ops = nft_bitwise_select_ops,
.policy = nft_bitwise_policy,
.maxattr = NFTA_BITWISE_MAX,
.owner = THIS_MODULE,
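
The fast path added here evaluates a 32-bit boolean bitwise operation as a single mask-and-xor. A tiny user-space sketch of the same computation; bitwise_fast is an illustrative name.

#include <stdint.h>
#include <stdio.h>

static uint32_t bitwise_fast(uint32_t src, uint32_t mask, uint32_t xor)
{
        return (src & mask) ^ xor;
}

int main(void)
{
        /* keep the low 8 bits, then flip bit 0 */
        printf("%#x\n", bitwise_fast(0x12345678, 0xff, 0x1));   /* 0x79 */
        return 0;
}
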
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index c78d01bc02e9..ff8528ad3dc6 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -161,16 +161,49 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
return nft_do_chain(&pkt, priv);
}
+static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_hook_state ingress_state = *state;
+ struct nft_pktinfo pkt;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ /* Original hook is NFPROTO_NETDEV and NF_NETDEV_INGRESS. */
+ ingress_state.pf = NFPROTO_IPV4;
+ ingress_state.hook = NF_INET_INGRESS;
+ nft_set_pktinfo(&pkt, skb, &ingress_state);
+
+ if (nft_set_pktinfo_ipv4_ingress(&pkt, skb) < 0)
+ return NF_DROP;
+ break;
+ case htons(ETH_P_IPV6):
+ ingress_state.pf = NFPROTO_IPV6;
+ ingress_state.hook = NF_INET_INGRESS;
+ nft_set_pktinfo(&pkt, skb, &ingress_state);
+
+ if (nft_set_pktinfo_ipv6_ingress(&pkt, skb) < 0)
+ return NF_DROP;
+ break;
+ default:
+ return NF_ACCEPT;
+ }
+
+ return nft_do_chain(&pkt, priv);
+}
+
static const struct nft_chain_type nft_chain_filter_inet = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
.family = NFPROTO_INET,
- .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ .hook_mask = (1 << NF_INET_INGRESS) |
+ (1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_FORWARD) |
(1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_POST_ROUTING),
.hooks = {
+ [NF_INET_INGRESS] = nft_do_chain_inet_ingress,
[NF_INET_LOCAL_IN] = nft_do_chain_inet,
[NF_INET_LOCAL_OUT] = nft_do_chain_inet,
[NF_INET_FORWARD] = nft_do_chain_inet,
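
The new inet ingress hook dispatches on the packet's EtherType and only hands IPv4/IPv6 traffic to the chain, accepting everything else. A user-space sketch of that dispatch; classify() is a made-up helper, and in the kernel the byte-swapped constants fold at compile time so they can sit directly in case labels.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define ETH_P_IP        0x0800
#define ETH_P_IPV6      0x86DD

static const char *classify(uint16_t protocol_be)
{
        if (protocol_be == htons(ETH_P_IP))
                return "ipv4: run chain";
        if (protocol_be == htons(ETH_P_IPV6))
                return "ipv6: run chain";
        return "other: accept";
}

int main(void)
{
        printf("%s\n", classify(htons(ETH_P_IP)));
        return 0;
}
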
diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c
index 8826bbe71136..edd02cda57fc 100644
--- a/net/netfilter/nft_chain_route.c
+++ b/net/netfilter/nft_chain_route.c
@@ -42,7 +42,7 @@ static unsigned int nf_route_table_hook4(void *priv,
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -92,7 +92,7 @@ static unsigned int nf_route_table_hook6(void *priv,
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
flowlabel != *((u32 *)ipv6_hdr(skb)))) {
- err = nf_ip6_route_me_harder(state->net, skb);
+ err = nf_ip6_route_me_harder(state->net, state->sk, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 16f4d84599ac..00e563a72d3d 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -123,11 +123,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
u8 *mask = (u8 *)&flow->match.mask;
u8 *key = (u8 *)&flow->match.key;
- if (priv->op != NFT_CMP_EQ || reg->len != priv->len)
+ if (priv->op != NFT_CMP_EQ || priv->len > reg->len)
return -EOPNOTSUPP;
- memcpy(key + reg->offset, &priv->data, priv->len);
- memcpy(mask + reg->offset, &reg->mask, priv->len);
+ memcpy(key + reg->offset, &priv->data, reg->len);
+ memcpy(mask + reg->offset, &reg->mask, reg->len);
flow->match.dissector.used_keys |= BIT(reg->key);
flow->match.dissector.offset[reg->key] = reg->base_offset;
@@ -137,7 +137,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
nft_reg_load16(priv->data.data) != ARPHRD_ETHER)
return -EOPNOTSUPP;
- nft_offload_update_dependency(ctx, &priv->data, priv->len);
+ nft_offload_update_dependency(ctx, &priv->data, reg->len);
return 0;
}
@@ -167,7 +167,6 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,
struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
struct nft_data_desc desc;
struct nft_data data;
- u32 mask;
int err;
err = nft_data_init(NULL, &data, sizeof(data), &desc,
@@ -181,10 +180,11 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,
return err;
desc.len *= BITS_PER_BYTE;
- mask = nft_cmp_fast_mask(desc.len);
- priv->data = data.data[0] & mask;
+ priv->mask = nft_cmp_fast_mask(desc.len);
+ priv->data = data.data[0] & priv->mask;
priv->len = desc.len;
+ priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
return 0;
}
@@ -201,7 +201,7 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
},
.sreg = priv->sreg,
.len = priv->len / BITS_PER_BYTE,
- .op = NFT_CMP_EQ,
+ .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ,
};
return __nft_cmp_offload(ctx, flow, &cmp);
@@ -210,11 +210,12 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
struct nft_data data;
if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ)))
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op)))
goto nla_put_failure;
data.data[0] = priv->data;
@@ -272,7 +273,7 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
goto err1;
}
- if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ)
+ if (desc.len <= sizeof(u32) && (op == NFT_CMP_EQ || op == NFT_CMP_NEQ))
return &nft_cmp_fast_ops;
return &nft_cmp_ops;
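
The fast-path change above keeps the operand pre-masked and records a single inversion flag, so both EQ and NEQ reduce to one masked compare. A minimal userspace sketch of that evaluation (illustrative names, not the kernel's):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch only: 'data' is stored already ANDed with 'mask', so one masked
 * compare plus an XOR with the inversion flag covers both EQ and NEQ. */
static bool cmp_fast(uint32_t sreg, uint32_t data, uint32_t mask, bool inv)
{
        return ((sreg & mask) == data) ^ inv;
}

int main(void)
{
        uint32_t mask = 0x0000ffff;

        printf("%d\n", cmp_fast(0xabcd1234, 0x1234 & mask, mask, false)); /* 1 */
        printf("%d\n", cmp_fast(0xabcd1234, 0x1234 & mask, mask, true));  /* 0 */
        return 0;
}
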
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 322bd674963e..8bcd49f14797 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -177,8 +177,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
}
#endif
case NFT_CT_ID:
- if (!nf_ct_is_confirmed(ct))
- goto err;
*dest = nf_ct_get_id(ct);
return;
default:
@@ -990,7 +988,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
if (!priv->l4proto)
return -ENOENT;
- nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name));
+ nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name));
if (tb[NFTA_CT_HELPER_L3PROTO])
family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO]));
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 64ca13a1885b..d164ef9e6843 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -19,11 +19,31 @@ struct nft_dynset {
enum nft_registers sreg_key:8;
enum nft_registers sreg_data:8;
bool invert;
+ bool expr;
+ u8 num_exprs;
u64 timeout;
- struct nft_expr *expr;
+ struct nft_expr *expr_array[NFT_SET_EXPR_MAX];
struct nft_set_binding binding;
};
+static int nft_dynset_expr_setup(const struct nft_dynset *priv,
+ const struct nft_set_ext *ext)
+{
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ struct nft_expr *expr;
+ int i;
+
+ for (i = 0; i < priv->num_exprs; i++) {
+ expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
+ if (nft_expr_clone(expr, priv->expr_array[i]) < 0)
+ return -1;
+
+ elem_expr->size += priv->expr_array[i]->ops->size;
+ }
+
+ return 0;
+}
+
static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
struct nft_regs *regs)
{
@@ -44,8 +64,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
goto err1;
ext = nft_set_elem_ext(set, elem);
- if (priv->expr != NULL &&
- nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0)
+ if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0)
goto err2;
return elem;
@@ -90,6 +109,41 @@ void nft_dynset_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
+static void nft_dynset_ext_add_expr(struct nft_dynset *priv)
+{
+ u8 size = 0;
+ int i;
+
+ for (i = 0; i < priv->num_exprs; i++)
+ size += priv->expr_array[i]->ops->size;
+
+ nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPRESSIONS,
+ sizeof(struct nft_set_elem_expr) + size);
+}
+
+static struct nft_expr *
+nft_dynset_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set,
+ const struct nlattr *attr, int pos)
+{
+ struct nft_expr *expr;
+ int err;
+
+ expr = nft_set_elem_expr_alloc(ctx, set, attr);
+ if (IS_ERR(expr))
+ return expr;
+
+ if (set->exprs[pos] && set->exprs[pos]->ops != expr->ops) {
+ err = -EOPNOTSUPP;
+ goto err_dynset_expr;
+ }
+
+ return expr;
+
+err_dynset_expr:
+ nft_expr_destroy(ctx, expr);
+ return ERR_PTR(err);
+}
+
static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
[NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
@@ -100,6 +154,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
[NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 },
[NFTA_DYNSET_EXPR] = { .type = NLA_NESTED },
[NFTA_DYNSET_FLAGS] = { .type = NLA_U32 },
+ [NFTA_DYNSET_EXPRESSIONS] = { .type = NLA_NESTED },
};
static int nft_dynset_init(const struct nft_ctx *ctx,
@@ -110,7 +165,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
u8 genmask = nft_genmask_next(ctx->net);
struct nft_set *set;
u64 timeout;
- int err;
+ int err, i;
lockdep_assert_held(&ctx->net->nft.commit_mutex);
@@ -121,11 +176,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (tb[NFTA_DYNSET_FLAGS]) {
u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));
-
- if (flags & ~NFT_DYNSET_F_INV)
- return -EINVAL;
+ if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR))
+ return -EOPNOTSUPP;
if (flags & NFT_DYNSET_F_INV)
priv->invert = true;
+ if (flags & NFT_DYNSET_F_EXPR)
+ priv->expr = true;
}
set = nft_set_lookup_global(ctx->net, ctx->table,
@@ -156,9 +212,11 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
timeout = 0;
if (tb[NFTA_DYNSET_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
- return -EINVAL;
- timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
- tb[NFTA_DYNSET_TIMEOUT])));
+ return -EOPNOTSUPP;
+
+ err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout);
+ if (err)
+ return err;
}
priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]);
@@ -168,7 +226,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (tb[NFTA_DYNSET_SREG_DATA] != NULL) {
if (!(set->flags & NFT_SET_MAP))
- return -EINVAL;
+ return -EOPNOTSUPP;
if (set->dtype == NFT_DATA_VERDICT)
return -EOPNOTSUPP;
@@ -179,31 +237,85 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
} else if (set->flags & NFT_SET_MAP)
return -EINVAL;
- if (tb[NFTA_DYNSET_EXPR] != NULL) {
- if (!(set->flags & NFT_SET_EVAL))
- return -EINVAL;
+ if ((tb[NFTA_DYNSET_EXPR] || tb[NFTA_DYNSET_EXPRESSIONS]) &&
+ !(set->flags & NFT_SET_EVAL))
+ return -EINVAL;
+
+ if (tb[NFTA_DYNSET_EXPR]) {
+ struct nft_expr *dynset_expr;
+
+ dynset_expr = nft_dynset_expr_alloc(ctx, set,
+ tb[NFTA_DYNSET_EXPR], 0);
+ if (IS_ERR(dynset_expr))
+ return PTR_ERR(dynset_expr);
+
+ priv->num_exprs++;
+ priv->expr_array[0] = dynset_expr;
+
+ if (set->num_exprs > 1 ||
+ (set->num_exprs == 1 &&
+ dynset_expr->ops != set->exprs[0]->ops)) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
+ } else if (tb[NFTA_DYNSET_EXPRESSIONS]) {
+ struct nft_expr *dynset_expr;
+ struct nlattr *tmp;
+ int left;
- priv->expr = nft_set_elem_expr_alloc(ctx, set,
- tb[NFTA_DYNSET_EXPR]);
- if (IS_ERR(priv->expr))
- return PTR_ERR(priv->expr);
+ if (!priv->expr)
+ return -EINVAL;
- if (set->expr && set->expr->ops != priv->expr->ops) {
+ i = 0;
+ nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX) {
+ err = -E2BIG;
+ goto err_expr_free;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_expr_free;
+ }
+ dynset_expr = nft_dynset_expr_alloc(ctx, set, tmp, i);
+ if (IS_ERR(dynset_expr)) {
+ err = PTR_ERR(dynset_expr);
+ goto err_expr_free;
+ }
+ priv->expr_array[i] = dynset_expr;
+ priv->num_exprs++;
+
+ if (set->num_exprs &&
+ dynset_expr->ops != set->exprs[i]->ops) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
+ i++;
+ }
+ if (set->num_exprs && set->num_exprs != i) {
err = -EOPNOTSUPP;
goto err_expr_free;
}
+ } else if (set->num_exprs > 0) {
+ err = nft_set_elem_expr_clone(ctx, set, priv->expr_array);
+ if (err < 0)
+ return err;
+
+ priv->num_exprs = set->num_exprs;
}
nft_set_ext_prepare(&priv->tmpl);
nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen);
if (set->flags & NFT_SET_MAP)
nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen);
- if (priv->expr != NULL)
- nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR,
- priv->expr->ops->size);
+
+ if (priv->num_exprs)
+ nft_dynset_ext_add_expr(priv);
+
if (set->flags & NFT_SET_TIMEOUT) {
- if (timeout || set->timeout)
+ if (timeout || set->timeout) {
+ nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT);
nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION);
+ }
}
priv->timeout = timeout;
@@ -219,8 +331,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
return 0;
err_expr_free:
- if (priv->expr != NULL)
- nft_expr_destroy(ctx, priv->expr);
+ for (i = 0; i < priv->num_exprs; i++)
+ nft_expr_destroy(ctx, priv->expr_array[i]);
return err;
}
@@ -245,9 +357,10 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
struct nft_dynset *priv = nft_expr_priv(expr);
+ int i;
- if (priv->expr != NULL)
- nft_expr_destroy(ctx, priv->expr);
+ for (i = 0; i < priv->num_exprs; i++)
+ nft_expr_destroy(ctx, priv->expr_array[i]);
nf_tables_destroy_set(ctx, priv->set);
}
@@ -256,6 +369,7 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
+ int i;
if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
goto nla_put_failure;
@@ -267,11 +381,29 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name))
goto nla_put_failure;
if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT,
- cpu_to_be64(jiffies_to_msecs(priv->timeout)),
+ nf_jiffies64_to_msecs(priv->timeout),
NFTA_DYNSET_PAD))
goto nla_put_failure;
- if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
- goto nla_put_failure;
+ if (priv->set->num_exprs == 0) {
+ if (priv->num_exprs == 1) {
+ if (nft_expr_dump(skb, NFTA_DYNSET_EXPR,
+ priv->expr_array[0]))
+ goto nla_put_failure;
+ } else if (priv->num_exprs > 1) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, NFTA_DYNSET_EXPRESSIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ for (i = 0; i < priv->num_exprs; i++) {
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM,
+ priv->expr_array[i]))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
+ }
if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags)))
goto nla_put_failure;
return 0;
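
The dynset changes above attach several expressions to one element by cloning them back to back into a single per-element area and advancing a running size. A standalone sketch of that layout idea (invented names, plain memcpy standing in for expression cloning):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Illustrative only: records are appended contiguously; the running size
 * doubles as the offset of the next record, as in the per-element area. */
struct expr_area {
        uint32_t size;          /* bytes used so far */
        uint8_t  data[64];
};

static int area_append(struct expr_area *area, const void *expr, uint32_t len)
{
        if (area->size + len > sizeof(area->data))
                return -1;
        memcpy(area->data + area->size, expr, len);
        area->size += len;
        return 0;
}

int main(void)
{
        struct expr_area area = { 0 };
        uint32_t counter = 0, limit = 100;

        area_append(&area, &counter, sizeof(counter));
        area_append(&area, &limit, sizeof(limit));
        printf("area holds %u bytes\n", area.size);
        return 0;
}
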
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 3087e23297db..b77985986b24 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -138,6 +138,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
return;
skb->dev = dev;
+ skb->tstamp = 0;
neigh_xmit(neigh_table, dev, addr, skb);
out:
regs->verdict.code = verdict;
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 57899454a530..a06a46b039c5 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -152,7 +152,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
if (priv->prefix == NULL)
return -ENOMEM;
- nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1);
+ nla_strscpy(priv->prefix, nla, nla_len(nla) + 1);
} else {
priv->prefix = (char *)nft_log_null_prefix;
}
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index b37bd02448d8..bf4b3ad5314c 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -724,22 +724,22 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
switch (priv->key) {
case NFT_META_PROTOCOL:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto,
- sizeof(__u16), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto,
+ sizeof(__u16), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
case NFT_META_L4PROTO:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
- sizeof(__u8), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+ sizeof(__u8), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
break;
case NFT_META_IIF:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
- ingress_ifindex, sizeof(__u32), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_ifindex, sizeof(__u32), reg);
break;
case NFT_META_IIFTYPE:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
- ingress_iftype, sizeof(__u16), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_iftype, sizeof(__u16), reg);
break;
default:
return -EOPNOTSUPP;
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 7a2e59638499..47d4e0e21651 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -22,6 +22,7 @@
#include <linux/icmpv6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <net/sctp/checksum.h>
static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
struct vlan_ethhdr *veth)
@@ -164,6 +165,34 @@ nla_put_failure:
return -1;
}
+static bool nft_payload_offload_mask(struct nft_offload_reg *reg,
+ u32 priv_len, u32 field_len)
+{
+ unsigned int remainder, delta, k;
+ struct nft_data mask = {};
+ __be32 remainder_mask;
+
+ if (priv_len == field_len) {
+ memset(&reg->mask, 0xff, priv_len);
+ return true;
+ } else if (priv_len > field_len) {
+ return false;
+ }
+
+ memset(&mask, 0xff, field_len);
+ remainder = priv_len % sizeof(u32);
+ if (remainder) {
+ k = priv_len / sizeof(u32);
+ delta = field_len - priv_len;
+ remainder_mask = htonl(~((1 << (delta * BITS_PER_BYTE)) - 1));
+ mask.data[k] = (__force u32)remainder_mask;
+ }
+
+ memcpy(&reg->mask, &mask, field_len);
+
+ return true;
+}
+
static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
struct nft_flow_rule *flow,
const struct nft_payload *priv)
@@ -172,21 +201,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct ethhdr, h_source):
- if (priv->len != ETH_ALEN)
+ if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
src, ETH_ALEN, reg);
break;
case offsetof(struct ethhdr, h_dest):
- if (priv->len != ETH_ALEN)
+ if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
dst, ETH_ALEN, reg);
break;
case offsetof(struct ethhdr, h_proto):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic,
@@ -194,14 +223,14 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
case offsetof(struct vlan_ethhdr, h_vlan_TCI):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
vlan_tci, sizeof(__be16), reg);
break;
case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
@@ -209,7 +238,7 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
@@ -217,7 +246,7 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
break;
case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) +
sizeof(struct vlan_hdr):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
@@ -238,21 +267,25 @@ static int nft_payload_offload_ip(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct iphdr, saddr):
- if (priv->len != sizeof(struct in_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src,
sizeof(struct in_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS);
break;
case offsetof(struct iphdr, daddr):
- if (priv->len != sizeof(struct in_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst,
sizeof(struct in_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS);
break;
case offsetof(struct iphdr, protocol):
- if (priv->len != sizeof(__u8))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
@@ -274,21 +307,25 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct ipv6hdr, saddr):
- if (priv->len != sizeof(struct in6_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in6_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src,
sizeof(struct in6_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS);
break;
case offsetof(struct ipv6hdr, daddr):
- if (priv->len != sizeof(struct in6_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in6_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst,
sizeof(struct in6_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS);
break;
case offsetof(struct ipv6hdr, nexthdr):
- if (priv->len != sizeof(__u8))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
@@ -330,14 +367,14 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct tcphdr, source):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
sizeof(__be16), reg);
break;
case offsetof(struct tcphdr, dest):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
@@ -358,14 +395,14 @@ static int nft_payload_offload_udp(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct udphdr, source):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
sizeof(__be16), reg);
break;
case offsetof(struct udphdr, dest):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
@@ -484,6 +521,19 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
return 0;
}
+static int nft_payload_csum_sctp(struct sk_buff *skb, int offset)
+{
+ struct sctphdr *sh;
+
+ if (skb_ensure_writable(skb, offset + sizeof(*sh)))
+ return -1;
+
+ sh = (struct sctphdr *)(skb->data + offset);
+ sh->checksum = sctp_compute_cksum(skb, offset);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ return 0;
+}
+
static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt,
struct sk_buff *skb,
__wsum fsum, __wsum tsum)
@@ -587,6 +637,13 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
skb_store_bits(skb, offset, src, priv->len) < 0)
goto err;
+ if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP &&
+ pkt->tprot == IPPROTO_SCTP &&
+ skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (nft_payload_csum_sctp(skb, pkt->xt.thoff))
+ goto err;
+ }
+
return;
err:
regs->verdict.code = NFT_BREAK;
@@ -623,6 +680,13 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
case NFT_PAYLOAD_CSUM_NONE:
case NFT_PAYLOAD_CSUM_INET:
break;
+ case NFT_PAYLOAD_CSUM_SCTP:
+ if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER)
+ return -EINVAL;
+
+ if (priv->csum_offset != offsetof(struct sctphdr, checksum))
+ return -EINVAL;
+ break;
default:
return -EOPNOTSUPP;
}
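
The payload offload hunks above relax the exact-length checks: a match shorter than the header field is now accepted and expressed through a mask instead of being rejected with -EOPNOTSUPP. A simplified userspace sketch of the idea, covering only whole-byte prefixes (not the kernel helper itself):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Simplified sketch: matching the first priv_len bytes of a field_len-byte
 * big-endian field can be expressed as priv_len 0xff bytes followed by
 * zeroes in the offload mask. */
static int build_prefix_mask(uint8_t *mask, size_t priv_len, size_t field_len)
{
        if (priv_len > field_len)
                return -1;      /* cannot match beyond the field */
        memset(mask, 0x00, field_len);
        memset(mask, 0xff, priv_len);
        return 0;
}

int main(void)
{
        uint8_t mask[4];

        /* e.g. a /24 match on an IPv4 address field */
        build_prefix_mask(mask, 3, sizeof(mask));
        printf("%02x %02x %02x %02x\n", mask[0], mask[1], mask[2], mask[3]);
        return 0;
}
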
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 61fb7e8afbf0..927ff8459bd9 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -40,6 +40,7 @@ int nft_reject_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_reject *priv = nft_expr_priv(expr);
+ int icmp_code;
if (tb[NFTA_REJECT_TYPE] == NULL)
return -EINVAL;
@@ -47,9 +48,17 @@ int nft_reject_init(const struct nft_ctx *ctx,
priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
return -EINVAL;
- priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+
+ icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+ if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
+ icmp_code > NFT_REJECT_ICMPX_MAX)
+ return -EINVAL;
+
+ priv->icmp_code = icmp_code;
+ break;
case NFT_REJECT_TCP_RST:
break;
default:
@@ -69,6 +78,7 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
goto nla_put_failure;
break;
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index cf8f2646e93c..95090186ee90 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -28,7 +28,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset(nft_net(pkt), pkt->xt.state->sk,
+ pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach(pkt->skb,
@@ -44,7 +45,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset6(nft_net(pkt), pkt->xt.state->sk,
+ pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach6(nft_net(pkt), pkt->skb,
@@ -58,60 +60,16 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
regs->verdict.code = NF_DROP;
}
-static int nft_reject_inet_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_reject_inet_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
{
- struct nft_reject *priv = nft_expr_priv(expr);
- int icmp_code;
-
- if (tb[NFTA_REJECT_TYPE] == NULL)
- return -EINVAL;
-
- priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
- return -EINVAL;
-
- icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
- if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
- icmp_code > NFT_REJECT_ICMPX_MAX)
- return -EINVAL;
-
- priv->icmp_code = icmp_code;
- break;
- case NFT_REJECT_TCP_RST:
- break;
- default:
- return -EINVAL;
- }
- return 0;
-}
-
-static int nft_reject_inet_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
-{
- const struct nft_reject *priv = nft_expr_priv(expr);
-
- if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
- goto nla_put_failure;
-
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
- goto nla_put_failure;
- break;
- default:
- break;
- }
-
- return 0;
-
-nla_put_failure:
- return -1;
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_INGRESS));
}
static struct nft_expr_type nft_reject_inet_type;
@@ -119,9 +77,9 @@ static const struct nft_expr_ops nft_reject_inet_ops = {
.type = &nft_reject_inet_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
.eval = nft_reject_inet_eval,
- .init = nft_reject_inet_init,
- .dump = nft_reject_inet_dump,
- .validate = nft_reject_validate,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_inet_validate,
};
static struct nft_expr_type nft_reject_inet_type __read_mostly = {
diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c
new file mode 100644
index 000000000000..d89f68754f42
--- /dev/null
+++ b/net/netfilter/nft_reject_netdev.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 Laura Garcia Liebana <nevola@gmail.com>
+ * Copyright (c) 2020 Jose M. Guisado <guigom@riseup.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+
+static void nft_reject_queue_xmit(struct sk_buff *nskb, struct sk_buff *oldskb)
+{
+ dev_hard_header(nskb, nskb->dev, ntohs(oldskb->protocol),
+ eth_hdr(oldskb)->h_source, eth_hdr(oldskb)->h_dest,
+ nskb->len);
+ dev_queue_xmit(nskb);
+}
+
+static void nft_reject_netdev_send_v4_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_send_v4_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_send_v6_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+
+static void nft_reject_netdev_send_v6_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct ethhdr *eth = eth_hdr(pkt->skb);
+ struct nft_reject *priv = nft_expr_priv(expr);
+ const unsigned char *dest = eth->h_dest;
+
+ if (is_broadcast_ether_addr(dest) ||
+ is_multicast_ether_addr(dest))
+ goto out;
+
+ switch (eth->h_proto) {
+ case htons(ETH_P_IP):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_netdev_send_v4_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmp_code(priv->icmp_code));
+ break;
+ }
+ break;
+ case htons(ETH_P_IPV6):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_netdev_send_v6_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmpv6_code(priv->icmp_code));
+ break;
+ }
+ break;
+ default:
+ /* No explicit way to reject this protocol, drop it. */
+ break;
+ }
+out:
+ regs->verdict.code = NF_DROP;
+}
+
+static int nft_reject_netdev_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
+}
+
+static struct nft_expr_type nft_reject_netdev_type;
+static const struct nft_expr_ops nft_reject_netdev_ops = {
+ .type = &nft_reject_netdev_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_netdev_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_netdev_validate,
+};
+
+static struct nft_expr_type nft_reject_netdev_type __read_mostly = {
+ .family = NFPROTO_NETDEV,
+ .name = "reject",
+ .ops = &nft_reject_netdev_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_netdev_module_init(void)
+{
+ return nft_register_expr(&nft_reject_netdev_type);
+}
+
+static void __exit nft_reject_netdev_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_netdev_type);
+}
+
+module_init(nft_reject_netdev_module_init);
+module_exit(nft_reject_netdev_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laura Garcia Liebana <nevola@gmail.com>");
+MODULE_AUTHOR("Jose M. Guisado <guigom@riseup.net>");
+MODULE_DESCRIPTION("Reject packets from netdev via nftables");
+MODULE_ALIAS_NFT_AF_EXPR(5, "reject");
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 4d3f147e8d8d..bf618b7ec1ae 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -293,6 +293,22 @@ cont:
rhashtable_walk_exit(&hti);
}
+static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
+ struct nft_set_ext *ext)
+{
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ struct nft_expr *expr;
+ u32 size;
+
+ nft_setelem_expr_foreach(expr, elem_expr, size) {
+ if (expr->ops->gc &&
+ expr->ops->gc(read_pnet(&set->net), expr))
+ return true;
+ }
+
+ return false;
+}
+
static void nft_rhash_gc(struct work_struct *work)
{
struct nft_set *set;
@@ -314,16 +330,13 @@ static void nft_rhash_gc(struct work_struct *work)
continue;
}
- if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) {
- struct nft_expr *expr = nft_set_ext_expr(&he->ext);
+ if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) &&
+ nft_rhash_expr_needs_gc_run(set, &he->ext))
+ goto needs_gc_run;
- if (expr->ops->gc &&
- expr->ops->gc(read_pnet(&set->net), expr))
- goto gc;
- }
if (!nft_set_elem_expired(&he->ext))
continue;
-gc:
+needs_gc_run:
if (nft_set_elem_mark_busy(&he->ext))
continue;
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 637ce3e8c575..a28aca5124ce 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -14,6 +14,25 @@ struct nft_socket {
};
};
+static void nft_socket_wildcard(const struct nft_pktinfo *pkt,
+ struct nft_regs *regs, struct sock *sk,
+ u32 *dest)
+{
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr));
+ break;
+#endif
+ default:
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+}
+
static void nft_socket_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -59,6 +78,13 @@ static void nft_socket_eval(const struct nft_expr *expr,
return;
}
break;
+ case NFT_SOCKET_WILDCARD:
+ if (!sk_fullsock(sk)) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ nft_socket_wildcard(pkt, regs, sk, dest);
+ break;
default:
WARN_ON(1);
regs->verdict.code = NFT_BREAK;
@@ -97,6 +123,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
switch(priv->key) {
case NFT_SOCKET_TRANSPARENT:
+ case NFT_SOCKET_WILDCARD:
len = sizeof(u8);
break;
case NFT_SOCKET_MARK:
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index cedf47ab3c6f..2182d361e273 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -191,8 +191,8 @@ static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry
skb->mark == rt_info->mark &&
iph->daddr == rt_info->daddr &&
iph->saddr == rt_info->saddr))
- return ip_route_me_harder(entry->state.net, skb,
- RTN_UNSPEC);
+ return ip_route_me_harder(entry->state.net, entry->state.sk,
+ skb, RTN_UNSPEC);
}
#endif
return 0;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index af22dbe85e2c..acce622582e3 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1349,6 +1349,14 @@ struct xt_counters *xt_counters_alloc(unsigned int counters)
}
EXPORT_SYMBOL(xt_counters_alloc);
+struct xt_table_info
+*xt_table_get_private_protected(const struct xt_table *table)
+{
+ return rcu_dereference_protected(table->private,
+ mutex_is_locked(&xt[table->af].mutex));
+}
+EXPORT_SYMBOL(xt_table_get_private_protected);
+
struct xt_table_info *
xt_replace_table(struct xt_table *table,
unsigned int num_counters,
@@ -1356,7 +1364,6 @@ xt_replace_table(struct xt_table *table,
int *error)
{
struct xt_table_info *private;
- unsigned int cpu;
int ret;
ret = xt_jumpstack_alloc(newinfo);
@@ -1366,47 +1373,20 @@ xt_replace_table(struct xt_table *table,
}
/* Do the substitution. */
- local_bh_disable();
- private = table->private;
+ private = xt_table_get_private_protected(table);
/* Check inside lock: is the old number correct? */
if (num_counters != private->number) {
pr_debug("num_counters != table->private->number (%u/%u)\n",
num_counters, private->number);
- local_bh_enable();
*error = -EAGAIN;
return NULL;
}
newinfo->initial_entries = private->initial_entries;
- /*
- * Ensure contents of newinfo are visible before assigning to
- * private.
- */
- smp_wmb();
- table->private = newinfo;
-
- /* make sure all cpus see new ->private value */
- smp_wmb();
- /*
- * Even though table entries have now been swapped, other CPU's
- * may still be using the old entries...
- */
- local_bh_enable();
-
- /* ... so wait for even xt_recseq on all cpus */
- for_each_possible_cpu(cpu) {
- seqcount_t *s = &per_cpu(xt_recseq, cpu);
- u32 seq = raw_read_seqcount(s);
-
- if (seq & 1) {
- do {
- cond_resched();
- cpu_relax();
- } while (seq == raw_read_seqcount(s));
- }
- }
+ rcu_assign_pointer(table->private, newinfo);
+ synchronize_rcu();
audit_log_nfcfg(table->name, table->af, private->number,
!private->number ? AUDIT_XT_OP_REGISTER :
@@ -1442,12 +1422,12 @@ struct xt_table *xt_register_table(struct net *net,
}
/* Simplifies replace_table code. */
- table->private = bootstrap;
+ rcu_assign_pointer(table->private, bootstrap);
if (!xt_replace_table(table, 0, newinfo, &ret))
goto unlock;
- private = table->private;
+ private = xt_table_get_private_protected(table);
pr_debug("table->private->number = %u\n", private->number);
/* save number of initial entries */
@@ -1470,7 +1450,8 @@ void *xt_unregister_table(struct xt_table *table)
struct xt_table_info *private;
mutex_lock(&xt[table->af].mutex);
- private = table->private;
+ private = xt_table_get_private_protected(table);
+ RCU_INIT_POINTER(table->private, NULL);
list_del(&table->list);
mutex_unlock(&xt[table->af].mutex);
audit_log_nfcfg(table->name, table->af, private->number,
diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
index 713fb38541df..8928ec56c388 100644
--- a/net/netfilter/xt_HMARK.c
+++ b/net/netfilter/xt_HMARK.c
@@ -276,7 +276,7 @@ hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t,
return 0;
/* follow-up fragments don't contain ports, skip all fragments */
- if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+ if (ip_is_fragment(ip))
return 0;
hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info);
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 37253d399c6b..0d5c422f8745 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
} cfg;
int ret;
+ if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name))
+ return -ENAMETOOLONG;
+
net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
mutex_lock(&xn->hash_lock);
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index a97c2259bbc8..7c6bf1c16813 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -27,7 +27,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
overquota = nfnl_acct_overquota(xt_net(par), info->nfacct);
- return overquota == NFACCT_UNDERQUOTA ? false : true;
+ return overquota != NFACCT_UNDERQUOTA;
}
static int
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 606411869698..0446307516cd 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -152,7 +152,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
/*
* Drop entries with timestamps older than 'time'.
*/
-static void recent_entry_reap(struct recent_table *t, unsigned long time)
+static void recent_entry_reap(struct recent_table *t, unsigned long time,
+ struct recent_entry *working, bool update)
{
struct recent_entry *e;
@@ -162,6 +163,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time)
e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
/*
+ * Do not reap the entry which is going to be updated.
+ */
+ if (e == working && update)
+ return;
+
+ /*
* The last time stamp is the most recent.
*/
if (time_after(time, e->stamps[e->index-1]))
@@ -303,7 +310,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
/* info->seconds must be non-zero */
if (info->check_set & XT_RECENT_REAP)
- recent_entry_reap(t, time);
+ recent_entry_reap(t, time, e,
+ info->check_set & XT_RECENT_UPDATE && ret);
}
if (info->check_set & XT_RECENT_SET ||
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 249da67d50a2..f28c8947c730 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -304,7 +304,7 @@ static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info)
/* NetLabel Generic NETLINK Command Definitions
*/
-static const struct genl_ops netlbl_calipso_ops[] = {
+static const struct genl_small_ops netlbl_calipso_ops[] = {
{
.cmd = NLBL_CALIPSO_C_ADD,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -342,8 +342,8 @@ static struct genl_family netlbl_calipso_gnl_family __ro_after_init = {
.maxattr = NLBL_CALIPSO_A_MAX,
.policy = calipso_genl_policy,
.module = THIS_MODULE,
- .ops = netlbl_calipso_ops,
- .n_ops = ARRAY_SIZE(netlbl_calipso_ops),
+ .small_ops = netlbl_calipso_ops,
+ .n_small_ops = ARRAY_SIZE(netlbl_calipso_ops),
};
/* NetLabel Generic NETLINK Protocol Functions
@@ -366,6 +366,7 @@ static const struct netlbl_calipso_ops *calipso_ops;
/**
* netlbl_calipso_ops_register - Register the CALIPSO operations
+ * @ops: ops to register
*
* Description:
* Register the CALIPSO packet engine operations.
@@ -426,7 +427,7 @@ void calipso_doi_free(struct calipso_doi *doi_def)
/**
* calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
* @doi: the DOI value
- * @audit_secid: the LSM secid to use in the audit message
+ * @audit_info: NetLabel audit information
*
* Description:
* Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
@@ -595,7 +596,7 @@ int calipso_req_setattr(struct request_sock *req,
/**
* calipso_req_delattr - Delete the CALIPSO option from a request socket
- * @reg: the request socket
+ * @req: the request socket
*
* Description:
* Removes the CALIPSO option from a request socket, if present.
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 0f16080b87cb..726dda95934c 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -724,7 +724,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
* NetLabel Generic NETLINK Command Definitions
*/
-static const struct genl_ops netlbl_cipsov4_ops[] = {
+static const struct genl_small_ops netlbl_cipsov4_ops[] = {
{
.cmd = NLBL_CIPSOV4_C_ADD,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -762,8 +762,8 @@ static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = {
.maxattr = NLBL_CIPSOV4_A_MAX,
.policy = netlbl_cipsov4_genl_policy,
.module = THIS_MODULE,
- .ops = netlbl_cipsov4_ops,
- .n_ops = ARRAY_SIZE(netlbl_cipsov4_ops),
+ .small_ops = netlbl_cipsov4_ops,
+ .n_small_ops = ARRAY_SIZE(netlbl_cipsov4_ops),
};
/*
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index f73a8382c275..dc8c39f51f7d 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -612,9 +612,8 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
if (audit_buf != NULL) {
audit_log_format(audit_buf,
- " nlbl_domain=%s res=%u",
- entry->domain ? entry->domain : "(default)",
- ret_val == 0 ? 1 : 0);
+ " nlbl_domain=%s res=1",
+ entry->domain ? entry->domain : "(default)");
audit_log_end(audit_buf);
}
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index e7a25fbfaf8b..df1b41ed73fd 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -95,7 +95,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
ret_val = -ENOMEM;
goto add_free_entry;
}
- nla_strlcpy(entry->domain,
+ nla_strscpy(entry->domain,
info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size);
}
@@ -757,7 +757,7 @@ version_failure:
* NetLabel Generic NETLINK Command Definitions
*/
-static const struct genl_ops netlbl_mgmt_genl_ops[] = {
+static const struct genl_small_ops netlbl_mgmt_genl_ops[] = {
{
.cmd = NLBL_MGMT_C_ADD,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -823,8 +823,8 @@ static struct genl_family netlbl_mgmt_gnl_family __ro_after_init = {
.maxattr = NLBL_MGMT_A_MAX,
.policy = netlbl_mgmt_genl_policy,
.module = THIS_MODULE,
- .ops = netlbl_mgmt_genl_ops,
- .n_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops),
+ .small_ops = netlbl_mgmt_genl_ops,
+ .n_small_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops),
};
/*
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 77bb1bb22c3b..ccb491642811 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1166,12 +1166,13 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
struct netlbl_unlhsh_walk_arg cb_arg;
u32 skip_bkt = cb->args[0];
u32 skip_chain = cb->args[1];
- u32 iter_bkt;
- u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
+ u32 skip_addr4 = cb->args[2];
+ u32 iter_bkt, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
struct netlbl_unlhsh_iface *iface;
struct list_head *iter_list;
struct netlbl_af4list *addr4;
#if IS_ENABLED(CONFIG_IPV6)
+ u32 skip_addr6 = cb->args[3];
struct netlbl_af6list *addr6;
#endif
@@ -1182,7 +1183,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
rcu_read_lock();
for (iter_bkt = skip_bkt;
iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
- iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
+ iter_bkt++) {
iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt];
list_for_each_entry_rcu(iface, iter_list, list) {
if (!iface->valid ||
@@ -1190,7 +1191,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
continue;
netlbl_af4list_foreach_rcu(addr4,
&iface->addr4_list) {
- if (iter_addr4++ < cb->args[2])
+ if (iter_addr4++ < skip_addr4)
continue;
if (netlbl_unlabel_staticlist_gen(
NLBL_UNLABEL_C_STATICLIST,
@@ -1203,10 +1204,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
goto unlabel_staticlist_return;
}
}
+ iter_addr4 = 0;
+ skip_addr4 = 0;
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach_rcu(addr6,
&iface->addr6_list) {
- if (iter_addr6++ < cb->args[3])
+ if (iter_addr6++ < skip_addr6)
continue;
if (netlbl_unlabel_staticlist_gen(
NLBL_UNLABEL_C_STATICLIST,
@@ -1219,8 +1222,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
goto unlabel_staticlist_return;
}
}
+ iter_addr6 = 0;
+ skip_addr6 = 0;
#endif /* IPv6 */
}
+ iter_chain = 0;
+ skip_chain = 0;
}
unlabel_staticlist_return:
@@ -1301,7 +1308,7 @@ unlabel_staticlistdef_return:
* NetLabel Generic NETLINK Command Definitions
*/
-static const struct genl_ops netlbl_unlabel_genl_ops[] = {
+static const struct genl_small_ops netlbl_unlabel_genl_ops[] = {
{
.cmd = NLBL_UNLABEL_C_STATICADD,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -1367,8 +1374,8 @@ static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = {
.maxattr = NLBL_UNLABEL_A_MAX,
.policy = netlbl_unlabel_genl_policy,
.module = THIS_MODULE,
- .ops = netlbl_unlabel_genl_ops,
- .n_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops),
+ .small_ops = netlbl_unlabel_genl_ops,
+ .n_small_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops),
};
/*
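
The staticlist change above fixes dump resumption: the skip counters taken from cb->args are consumed once and then reset, so later buckets and interfaces start again from their first address. A minimal userspace sketch of that resume pattern (invented names and sizes):

#include <stdio.h>

/* Sketch only: skip counters record where the previous dump stopped; once
 * an inner level has been skipped past, its counter must drop to zero so
 * later outer iterations start from their first element. */
static int dump_from(int skip_outer, int skip_inner, int budget)
{
        int sent = 0;

        for (int o = skip_outer; o < 4; o++) {
                for (int i = 0; i < 8; i++) {
                        if (i < skip_inner)
                                continue;
                        if (sent == budget) {
                                printf("stop at outer=%d inner=%d\n", o, i);
                                return sent;    /* caller saves o and i */
                        }
                        printf("emit %d/%d\n", o, i);
                        sent++;
                }
                skip_inner = 0; /* consumed: do not skip in later buckets */
        }
        return sent;
}

int main(void)
{
        dump_from(1, 5, 6);     /* resume part-way through bucket 1 */
        return 0;
}
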
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index d2d1448274f5..daca50d6bb12 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -848,7 +848,7 @@ retry:
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
- * message has has the capability @cap in the user namespace @user_ns.
+ * message has the capability @cap in the user namespace @user_ns.
*/
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
struct user_namespace *user_ns, int cap)
@@ -867,7 +867,7 @@ EXPORT_SYMBOL(__netlink_ns_capable);
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
- * message has has the capability @cap in the user namespace @user_ns.
+ * message has the capability @cap in the user namespace @user_ns.
*/
bool netlink_ns_capable(const struct sk_buff *skb,
struct user_namespace *user_ns, int cap)
@@ -883,7 +883,7 @@ EXPORT_SYMBOL(netlink_ns_capable);
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
- * message has has the capability @cap in all user namespaces.
+ * message has the capability @cap in all user namespaces.
*/
bool netlink_capable(const struct sk_buff *skb, int cap)
{
@@ -898,7 +898,7 @@ EXPORT_SYMBOL(netlink_capable);
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
- * message has has the capability @cap over the network namespace of
+ * message has the capability @cap over the network namespace of
* the socket we received the message from.
*/
bool netlink_net_capable(const struct sk_buff *skb, int cap)
@@ -1853,7 +1853,7 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
struct scm_cookie scm;
u32 netlink_skb_flags = 0;
- if (msg->msg_flags&MSG_OOB)
+ if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
err = scm_send(sock, msg, &scm, true);
@@ -1916,7 +1916,7 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
refcount_inc(&skb->users);
netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
}
- err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);
+ err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT);
out:
scm_destroy(&scm);
@@ -1929,12 +1929,12 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
- int noblock = flags&MSG_DONTWAIT;
+ int noblock = flags & MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb, *data_skb;
int err, ret;
- if (flags&MSG_OOB)
+ if (flags & MSG_OOB)
return -EOPNOTSUPP;
copied = 0;
@@ -2186,13 +2186,35 @@ EXPORT_SYMBOL(__nlmsg_put);
* It would be better to create a kernel thread.
*/
+static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct netlink_ext_ack *extack)
+{
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno),
+ NLM_F_MULTI | cb->answer_flags);
+ if (WARN_ON(!nlh))
+ return -ENOBUFS;
+
+ nl_dump_check_consistent(cb, nlh);
+ memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno));
+
+ if (extack->_msg && nlk->flags & NETLINK_F_EXT_ACK) {
+ nlh->nlmsg_flags |= NLM_F_ACK_TLVS;
+ if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg))
+ nlmsg_end(skb, nlh);
+ }
+
+ return 0;
+}
+
static int netlink_dump(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
struct netlink_ext_ack extack = {};
struct netlink_callback *cb;
struct sk_buff *skb = NULL;
- struct nlmsghdr *nlh;
struct module *module;
int err = -ENOBUFS;
int alloc_min_size;
@@ -2258,22 +2280,19 @@ static int netlink_dump(struct sock *sk)
return 0;
}
- nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE,
- sizeof(nlk->dump_done_errno),
- NLM_F_MULTI | cb->answer_flags);
- if (WARN_ON(!nlh))
+ if (netlink_dump_done(nlk, skb, cb, &extack))
goto errout_skb;
- nl_dump_check_consistent(cb, nlh);
-
- memcpy(nlmsg_data(nlh), &nlk->dump_done_errno,
- sizeof(nlk->dump_done_errno));
-
- if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) {
- nlh->nlmsg_flags |= NLM_F_ACK_TLVS;
- if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg))
- nlmsg_end(skb, nlh);
+#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
+ /* frag_list skb's data is used for compat tasks
+ * and the regular skb's data for normal (non-compat) tasks.
+ * See netlink_recvmsg().
+ */
+ if (unlikely(skb_shinfo(skb)->frag_list)) {
+ if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack))
+ goto errout_skb;
}
+#endif
if (sk_filter(sk, skb))
kfree_skb(skb);
@@ -2401,6 +2420,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
tlvlen += nla_total_size(sizeof(u32));
if (nlk_has_extack && extack && extack->cookie_len)
tlvlen += nla_total_size(extack->cookie_len);
+ if (err && nlk_has_extack && extack && extack->policy)
+ tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy);
if (tlvlen)
flags |= NLM_F_ACK_TLVS;
@@ -2433,6 +2454,9 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
if (extack->cookie_len)
WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
extack->cookie_len, extack->cookie));
+ if (extack->policy)
+ netlink_policy_dump_write_attr(skb, extack->policy,
+ NLMSGERR_ATTR_POLICY);
}
nlmsg_end(skb, rep);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index c4b4d3376227..c992424e4d63 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -107,16 +107,83 @@ static const struct genl_family *genl_family_find_byname(char *name)
return NULL;
}
-static const struct genl_ops *genl_get_cmd(u8 cmd,
- const struct genl_family *family)
+static int genl_get_cmd_cnt(const struct genl_family *family)
+{
+ return family->n_ops + family->n_small_ops;
+}
+
+static void genl_op_from_full(const struct genl_family *family,
+ unsigned int i, struct genl_ops *op)
+{
+ *op = family->ops[i];
+
+ if (!op->maxattr)
+ op->maxattr = family->maxattr;
+ if (!op->policy)
+ op->policy = family->policy;
+}
+
+static int genl_get_cmd_full(u32 cmd, const struct genl_family *family,
+ struct genl_ops *op)
{
int i;
for (i = 0; i < family->n_ops; i++)
- if (family->ops[i].cmd == cmd)
- return &family->ops[i];
+ if (family->ops[i].cmd == cmd) {
+ genl_op_from_full(family, i, op);
+ return 0;
+ }
- return NULL;
+ return -ENOENT;
+}
+
+static void genl_op_from_small(const struct genl_family *family,
+ unsigned int i, struct genl_ops *op)
+{
+ memset(op, 0, sizeof(*op));
+ op->doit = family->small_ops[i].doit;
+ op->dumpit = family->small_ops[i].dumpit;
+ op->cmd = family->small_ops[i].cmd;
+ op->internal_flags = family->small_ops[i].internal_flags;
+ op->flags = family->small_ops[i].flags;
+ op->validate = family->small_ops[i].validate;
+
+ op->maxattr = family->maxattr;
+ op->policy = family->policy;
+}
+
+static int genl_get_cmd_small(u32 cmd, const struct genl_family *family,
+ struct genl_ops *op)
+{
+ int i;
+
+ for (i = 0; i < family->n_small_ops; i++)
+ if (family->small_ops[i].cmd == cmd) {
+ genl_op_from_small(family, i, op);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int genl_get_cmd(u32 cmd, const struct genl_family *family,
+ struct genl_ops *op)
+{
+ if (!genl_get_cmd_full(cmd, family, op))
+ return 0;
+ return genl_get_cmd_small(cmd, family, op);
+}
+
+static void genl_get_cmd_by_index(unsigned int i,
+ const struct genl_family *family,
+ struct genl_ops *op)
+{
+ if (i < family->n_ops)
+ genl_op_from_full(family, i, op);
+ else if (i < family->n_ops + family->n_small_ops)
+ genl_op_from_small(family, i - family->n_ops, op);
+ else
+ WARN_ON_ONCE(1);
}
static int genl_allocate_reserve_groups(int n_groups, int *first_id)
@@ -222,7 +289,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
family->mcgrp_offset = first_id;
- /* if still initializing, can't and don't need to to realloc bitmaps */
+ /* if still initializing, can't and don't need to realloc bitmaps */
if (!init_net.genl_sock)
return 0;
@@ -286,22 +353,25 @@ static void genl_unregister_mc_groups(const struct genl_family *family)
static int genl_validate_ops(const struct genl_family *family)
{
- const struct genl_ops *ops = family->ops;
- unsigned int n_ops = family->n_ops;
int i, j;
- if (WARN_ON(n_ops && !ops))
+ if (WARN_ON(family->n_ops && !family->ops) ||
+ WARN_ON(family->n_small_ops && !family->small_ops))
return -EINVAL;
- if (!n_ops)
- return 0;
+ for (i = 0; i < genl_get_cmd_cnt(family); i++) {
+ struct genl_ops op;
- for (i = 0; i < n_ops; i++) {
- if (ops[i].dumpit == NULL && ops[i].doit == NULL)
+ genl_get_cmd_by_index(i, family, &op);
+ if (op.dumpit == NULL && op.doit == NULL)
return -EINVAL;
- for (j = i + 1; j < n_ops; j++)
- if (ops[i].cmd == ops[j].cmd)
+ for (j = i + 1; j < genl_get_cmd_cnt(family); j++) {
+ struct genl_ops op2;
+
+ genl_get_cmd_by_index(j, family, &op2);
+ if (op.cmd == op2.cmd)
return -EINVAL;
+ }
}
return 0;
@@ -467,16 +537,16 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family,
struct nlattr **attrbuf;
int err;
- if (!family->maxattr)
+ if (!ops->maxattr)
return NULL;
- attrbuf = kmalloc_array(family->maxattr + 1,
+ attrbuf = kmalloc_array(ops->maxattr + 1,
sizeof(struct nlattr *), GFP_KERNEL);
if (!attrbuf)
return ERR_PTR(-ENOMEM);
- err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr,
- family->policy, validate, extack);
+ err = __nlmsg_parse(nlh, hdrlen, attrbuf, ops->maxattr, ops->policy,
+ validate, extack);
if (err) {
kfree(attrbuf);
return ERR_PTR(err);
@@ -524,7 +594,7 @@ no_attrs:
return -ENOMEM;
}
info->family = ctx->family;
- info->ops = ops;
+ info->op = *ops;
info->attrs = attrs;
cb->data = info;
@@ -546,7 +616,7 @@ no_attrs:
static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
- const struct genl_ops *ops = genl_dumpit_info(cb)->ops;
+ const struct genl_ops *ops = &genl_dumpit_info(cb)->op;
int rc;
genl_lock();
@@ -558,7 +628,7 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
static int genl_lock_done(struct netlink_callback *cb)
{
const struct genl_dumpit_info *info = genl_dumpit_info(cb);
- const struct genl_ops *ops = info->ops;
+ const struct genl_ops *ops = &info->op;
int rc = 0;
if (ops->done) {
@@ -574,7 +644,7 @@ static int genl_lock_done(struct netlink_callback *cb)
static int genl_parallel_done(struct netlink_callback *cb)
{
const struct genl_dumpit_info *info = genl_dumpit_info(cb);
- const struct genl_ops *ops = info->ops;
+ const struct genl_ops *ops = &info->op;
int rc = 0;
if (ops->done)
@@ -682,9 +752,9 @@ static int genl_family_rcv_msg(const struct genl_family *family,
struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
- const struct genl_ops *ops;
struct net *net = sock_net(skb->sk);
struct genlmsghdr *hdr = nlmsg_data(nlh);
+ struct genl_ops op;
int hdrlen;
/* this family doesn't exist in this netns */
@@ -695,24 +765,23 @@ static int genl_family_rcv_msg(const struct genl_family *family,
if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
return -EINVAL;
- ops = genl_get_cmd(hdr->cmd, family);
- if (ops == NULL)
+ if (genl_get_cmd(hdr->cmd, family, &op))
return -EOPNOTSUPP;
- if ((ops->flags & GENL_ADMIN_PERM) &&
+ if ((op.flags & GENL_ADMIN_PERM) &&
!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
- if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
+ if ((op.flags & GENL_UNS_ADMIN_PERM) &&
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP)
return genl_family_rcv_msg_dumpit(family, skb, nlh, extack,
- ops, hdrlen, net);
+ &op, hdrlen, net);
else
return genl_family_rcv_msg_doit(family, skb, nlh, extack,
- ops, hdrlen, net);
+ &op, hdrlen, net);
}
static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -765,7 +834,7 @@ static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq,
nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr))
goto nla_put_failure;
- if (family->n_ops) {
+ if (genl_get_cmd_cnt(family)) {
struct nlattr *nla_ops;
int i;
@@ -773,23 +842,25 @@ static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq,
if (nla_ops == NULL)
goto nla_put_failure;
- for (i = 0; i < family->n_ops; i++) {
+ for (i = 0; i < genl_get_cmd_cnt(family); i++) {
struct nlattr *nest;
- const struct genl_ops *ops = &family->ops[i];
- u32 op_flags = ops->flags;
+ struct genl_ops op;
+ u32 op_flags;
- if (ops->dumpit)
+ genl_get_cmd_by_index(i, family, &op);
+ op_flags = op.flags;
+ if (op.dumpit)
op_flags |= GENL_CMD_CAP_DUMP;
- if (ops->doit)
+ if (op.doit)
op_flags |= GENL_CMD_CAP_DO;
- if (family->policy)
+ if (op.policy)
op_flags |= GENL_CMD_CAP_HASPOL;
nest = nla_nest_start_noflag(skb, i + 1);
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_u32(skb, CTRL_ATTR_OP_ID, ops->cmd) ||
+ if (nla_put_u32(skb, CTRL_ATTR_OP_ID, op.cmd) ||
nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags))
goto nla_put_failure;
@@ -945,7 +1016,7 @@ ctrl_build_mcgrp_msg(const struct genl_family *family,
return skb;
}
-static const struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] = {
+static const struct nla_policy ctrl_policy_family[] = {
[CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 },
[CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING,
.len = GENL_NAMSIZ - 1 },
@@ -1039,83 +1110,218 @@ static int genl_ctrl_event(int event, const struct genl_family *family,
return 0;
}
-static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb)
-{
+struct ctrl_dump_policy_ctx {
+ struct netlink_policy_dump_state *state;
const struct genl_family *rt;
- unsigned int fam_id = cb->args[0];
- int err;
+ unsigned int opidx;
+ u32 op;
+ u16 fam_id;
+ u8 policies:1,
+ single_op:1;
+};
- if (!fam_id) {
- struct nlattr *tb[CTRL_ATTR_MAX + 1];
+static const struct nla_policy ctrl_policy_policy[] = {
+ [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 },
+ [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING,
+ .len = GENL_NAMSIZ - 1 },
+ [CTRL_ATTR_OP] = { .type = NLA_U32 },
+};
- err = genlmsg_parse(cb->nlh, &genl_ctrl, tb,
- genl_ctrl.maxattr,
- genl_ctrl.policy, cb->extack);
- if (err)
- return err;
+static int ctrl_dumppolicy_start(struct netlink_callback *cb)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
+ struct nlattr **tb = info->attrs;
+ const struct genl_family *rt;
+ struct genl_ops op;
+ int err, i;
- if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME])
- return -EINVAL;
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
- if (tb[CTRL_ATTR_FAMILY_ID]) {
- fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]);
- } else {
- rt = genl_family_find_byname(
- nla_data(tb[CTRL_ATTR_FAMILY_NAME]));
- if (!rt)
- return -ENOENT;
- fam_id = rt->id;
- }
+ if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME])
+ return -EINVAL;
+
+ if (tb[CTRL_ATTR_FAMILY_ID]) {
+ ctx->fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]);
+ } else {
+ rt = genl_family_find_byname(
+ nla_data(tb[CTRL_ATTR_FAMILY_NAME]));
+ if (!rt)
+ return -ENOENT;
+ ctx->fam_id = rt->id;
}
- rt = genl_family_find_byid(fam_id);
+ rt = genl_family_find_byid(ctx->fam_id);
if (!rt)
return -ENOENT;
- if (!rt->policy)
+ ctx->rt = rt;
+
+ if (tb[CTRL_ATTR_OP]) {
+ ctx->single_op = true;
+ ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]);
+
+ err = genl_get_cmd(ctx->op, rt, &op);
+ if (err) {
+ NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]);
+ return err;
+ }
+
+ if (!op.policy)
+ return -ENODATA;
+
+ return netlink_policy_dump_add_policy(&ctx->state, op.policy,
+ op.maxattr);
+ }
+
+ for (i = 0; i < genl_get_cmd_cnt(rt); i++) {
+ genl_get_cmd_by_index(i, rt, &op);
+
+ if (op.policy) {
+ err = netlink_policy_dump_add_policy(&ctx->state,
+ op.policy,
+ op.maxattr);
+ if (err)
+ return err;
+ }
+ }
+
+ if (!ctx->state)
return -ENODATA;
+ return 0;
+}
- err = netlink_policy_dump_start(rt->policy, rt->maxattr, &cb->args[1]);
- if (err)
- return err;
+static void *ctrl_dumppolicy_prep(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
+ void *hdr;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &genl_ctrl,
+ NLM_F_MULTI, CTRL_CMD_GETPOLICY);
+ if (!hdr)
+ return NULL;
+
+ if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, ctx->fam_id))
+ return NULL;
+
+ return hdr;
+}
+
+static int ctrl_dumppolicy_put_op(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct genl_ops *op)
+{
+ struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
+ struct nlattr *nest_pol, *nest_op;
+ void *hdr;
+ int idx;
+
+ /* skip if we have nothing to show */
+ if (!op->policy)
+ return 0;
+ if (!op->doit &&
+ (!op->dumpit || op->validate & GENL_DONT_VALIDATE_DUMP))
+ return 0;
+
+ hdr = ctrl_dumppolicy_prep(skb, cb);
+ if (!hdr)
+ return -ENOBUFS;
+
+ nest_pol = nla_nest_start(skb, CTRL_ATTR_OP_POLICY);
+ if (!nest_pol)
+ goto err;
+
+ nest_op = nla_nest_start(skb, op->cmd);
+ if (!nest_op)
+ goto err;
+
+ /* for now both do/dump are always the same */
+ idx = netlink_policy_dump_get_policy_idx(ctx->state,
+ op->policy,
+ op->maxattr);
+
+ if (op->doit && nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx))
+ goto err;
+
+ if (op->dumpit && !(op->validate & GENL_DONT_VALIDATE_DUMP) &&
+ nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx))
+ goto err;
+
+ nla_nest_end(skb, nest_op);
+ nla_nest_end(skb, nest_pol);
+ genlmsg_end(skb, hdr);
- while (netlink_policy_dump_loop(cb->args[1])) {
- void *hdr;
+ return 0;
+err:
+ genlmsg_cancel(skb, hdr);
+ return -ENOBUFS;
+}
+
+static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
+ void *hdr;
+
+ if (!ctx->policies) {
+ while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) {
+ struct genl_ops op;
+
+ if (ctx->single_op) {
+ int err;
+
+ err = genl_get_cmd(ctx->op, ctx->rt, &op);
+ if (WARN_ON(err))
+ return skb->len;
+
+ /* break out of the loop after this one */
+ ctx->opidx = genl_get_cmd_cnt(ctx->rt);
+ } else {
+ genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op);
+ }
+
+ if (ctrl_dumppolicy_put_op(skb, cb, &op))
+ return skb->len;
+
+ ctx->opidx++;
+ }
+
+ /* completed with the per-op policy index list */
+ ctx->policies = true;
+ }
+
+ while (netlink_policy_dump_loop(ctx->state)) {
struct nlattr *nest;
- hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, &genl_ctrl,
- NLM_F_MULTI, CTRL_CMD_GETPOLICY);
+ hdr = ctrl_dumppolicy_prep(skb, cb);
if (!hdr)
goto nla_put_failure;
- if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, rt->id))
- goto nla_put_failure;
-
nest = nla_nest_start(skb, CTRL_ATTR_POLICY);
if (!nest)
goto nla_put_failure;
- if (netlink_policy_dump_write(skb, cb->args[1]))
+ if (netlink_policy_dump_write(skb, ctx->state))
goto nla_put_failure;
nla_nest_end(skb, nest);
genlmsg_end(skb, hdr);
- continue;
-
-nla_put_failure:
- genlmsg_cancel(skb, hdr);
- break;
}
- cb->args[0] = fam_id;
+ return skb->len;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
return skb->len;
}
static int ctrl_dumppolicy_done(struct netlink_callback *cb)
{
- netlink_policy_dump_free(cb->args[1]);
+ struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
+
+ netlink_policy_dump_free(ctx->state);
return 0;
}
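
The reworked CTRL_CMD_GETPOLICY dump thus proceeds in two phases: first one NLM_F_MULTI message per advertised op mapping the command to policy indexes, then the policy table itself, roughly one attribute description per message. A rough sketch of the resulting attribute layout, derived from ctrl_dumppolicy_put_op() and the CTRL_ATTR_POLICY loop above:

/* phase 1 - per-op index mapping (one message per op):
 *   CTRL_ATTR_FAMILY_ID
 *   CTRL_ATTR_OP_POLICY
 *     <op cmd>
 *       CTRL_ATTR_POLICY_DO   = <policy index>
 *       CTRL_ATTR_POLICY_DUMP = <policy index>
 *
 * phase 2 - the policies themselves:
 *   CTRL_ATTR_FAMILY_ID
 *   CTRL_ATTR_POLICY
 *     <policy index>
 *       <attr idx> -> NL_POLICY_TYPE_ATTR_* description
 */
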
@@ -1123,11 +1329,16 @@ static const struct genl_ops genl_ctrl_ops[] = {
{
.cmd = CTRL_CMD_GETFAMILY,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .policy = ctrl_policy_family,
+ .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1,
.doit = ctrl_getfamily,
.dumpit = ctrl_dumpfamily,
},
{
.cmd = CTRL_CMD_GETPOLICY,
+ .policy = ctrl_policy_policy,
+ .maxattr = ARRAY_SIZE(ctrl_policy_policy) - 1,
+ .start = ctrl_dumppolicy_start,
.dumpit = ctrl_dumppolicy,
.done = ctrl_dumppolicy_done,
},
@@ -1146,8 +1357,6 @@ static struct genl_family genl_ctrl __ro_after_init = {
.id = GENL_ID_CTRL,
.name = "nlctrl",
.version = 0x2,
- .maxattr = CTRL_ATTR_MAX,
- .policy = ctrl_policy,
.netnsok = true,
};
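
With .policy/.maxattr moved into struct genl_ops, the nlctrl family above drops its family-wide policy entirely, and any family can now advertise a different policy per command. A hedged sketch of such a registration; the FOO_* constants and handlers are made up for illustration:

static const struct nla_policy foo_get_policy[] = {
	[FOO_ATTR_ID] = { .type = NLA_U32 },
};

static const struct nla_policy foo_set_policy[] = {
	[FOO_ATTR_ID]   = { .type = NLA_U32 },
	[FOO_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = 15 },
};

static const struct genl_ops foo_ops[] = {
	{
		.cmd = FOO_CMD_GET,
		.policy = foo_get_policy,
		.maxattr = ARRAY_SIZE(foo_get_policy) - 1,
		.doit = foo_get_doit,
		.dumpit = foo_get_dumpit,
	},
	{
		.cmd = FOO_CMD_SET,
		.policy = foo_set_policy,
		.maxattr = ARRAY_SIZE(foo_set_policy) - 1,
		.flags = GENL_ADMIN_PERM,
		.doit = foo_set_doit,
	},
};

static struct genl_family foo_family __ro_after_init = {
	.name = "foo",
	.version = 1,
	.ops = foo_ops,
	.n_ops = ARRAY_SIZE(foo_ops),
	/* no family-level .policy / .maxattr needed any more */
	.module = THIS_MODULE,
};

Registration itself is unchanged: genl_register_family(&foo_family).
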
diff --git a/net/netlink/policy.c b/net/netlink/policy.c
index 0176b59ce530..8d7c900e27f4 100644
--- a/net/netlink/policy.c
+++ b/net/netlink/policy.c
@@ -14,7 +14,7 @@
#define INITIAL_POLICIES_ALLOC 10
-struct nl_policy_dump {
+struct netlink_policy_dump_state {
unsigned int policy_idx;
unsigned int attr_idx;
unsigned int n_alloc;
@@ -24,18 +24,19 @@ struct nl_policy_dump {
} policies[];
};
-static int add_policy(struct nl_policy_dump **statep,
+static int add_policy(struct netlink_policy_dump_state **statep,
const struct nla_policy *policy,
unsigned int maxtype)
{
- struct nl_policy_dump *state = *statep;
+ struct netlink_policy_dump_state *state = *statep;
unsigned int n_alloc, i;
if (!policy || !maxtype)
return 0;
for (i = 0; i < state->n_alloc; i++) {
- if (state->policies[i].policy == policy)
+ if (state->policies[i].policy == policy &&
+ state->policies[i].maxtype == maxtype)
return 0;
if (!state->policies[i].policy) {
@@ -62,42 +63,85 @@ static int add_policy(struct nl_policy_dump **statep,
return 0;
}
-static unsigned int get_policy_idx(struct nl_policy_dump *state,
- const struct nla_policy *policy)
+/**
+ * netlink_policy_dump_get_policy_idx - retrieve policy index
+ * @state: the policy dump state
+ * @policy: the policy to find
+ * @maxtype: the policy's maxattr
+ *
+ * Returns: the index of the given policy in the dump state
+ *
+ * Call this to find a policy index when you've added multiple policies and,
+ * e.g., need to tell userspace which command has which policy (by index).
+ *
+ * Note: this will WARN and return 0 if the policy isn't found, which
+ * means it wasn't added in the first place and indicates an internal
+ * consistency bug.
+ */
+int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state,
+ const struct nla_policy *policy,
+ unsigned int maxtype)
{
unsigned int i;
+ if (WARN_ON(!policy || !maxtype))
+ return 0;
+
for (i = 0; i < state->n_alloc; i++) {
- if (state->policies[i].policy == policy)
+ if (state->policies[i].policy == policy &&
+ state->policies[i].maxtype == maxtype)
return i;
}
- WARN_ON_ONCE(1);
- return -1;
+ WARN_ON(1);
+ return 0;
+}
+
+static struct netlink_policy_dump_state *alloc_state(void)
+{
+ struct netlink_policy_dump_state *state;
+
+ state = kzalloc(struct_size(state, policies, INITIAL_POLICIES_ALLOC),
+ GFP_KERNEL);
+ if (!state)
+ return ERR_PTR(-ENOMEM);
+ state->n_alloc = INITIAL_POLICIES_ALLOC;
+
+ return state;
}
-int netlink_policy_dump_start(const struct nla_policy *policy,
- unsigned int maxtype,
- unsigned long *_state)
+/**
+ * netlink_policy_dump_add_policy - add a policy to the dump
+ * @pstate: state to add to, may be reallocated, must be %NULL the first time
+ * @policy: the new policy to add to the dump
+ * @maxtype: the new policy's max attr type
+ *
+ * Returns: 0 on success, a negative error code otherwise.
+ *
+ * Call this to allocate a policy dump state, and to add policies to it. This
+ * should be called from the dump start() callback.
+ *
+ * Note: on failures, any previously allocated state is freed.
+ */
+int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate,
+ const struct nla_policy *policy,
+ unsigned int maxtype)
{
- struct nl_policy_dump *state;
+ struct netlink_policy_dump_state *state = *pstate;
unsigned int policy_idx;
int err;
- if (*_state)
- return 0;
+ if (!state) {
+ state = alloc_state();
+ if (IS_ERR(state))
+ return PTR_ERR(state);
+ }
/*
* walk the policies and nested ones first, and build
* a linear list of them.
*/
- state = kzalloc(struct_size(state, policies, INITIAL_POLICIES_ALLOC),
- GFP_KERNEL);
- if (!state)
- return -ENOMEM;
- state->n_alloc = INITIAL_POLICIES_ALLOC;
-
err = add_policy(&state, policy, maxtype);
if (err)
return err;
@@ -128,62 +172,103 @@ int netlink_policy_dump_start(const struct nla_policy *policy,
}
}
- *_state = (unsigned long)state;
-
+ *pstate = state;
return 0;
}
-static bool netlink_policy_dump_finished(struct nl_policy_dump *state)
+static bool
+netlink_policy_dump_finished(struct netlink_policy_dump_state *state)
{
return state->policy_idx >= state->n_alloc ||
!state->policies[state->policy_idx].policy;
}
-bool netlink_policy_dump_loop(unsigned long _state)
+/**
+ * netlink_policy_dump_loop - dumping loop indicator
+ * @state: the policy dump state
+ *
+ * Returns: %true if the dump continues, %false otherwise
+ *
+ * Note: the dump state is not freed here; free it via netlink_policy_dump_free()
+ */
+bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state)
{
- struct nl_policy_dump *state = (void *)_state;
-
return !netlink_policy_dump_finished(state);
}
-int netlink_policy_dump_write(struct sk_buff *skb, unsigned long _state)
+int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt)
{
- struct nl_policy_dump *state = (void *)_state;
- const struct nla_policy *pt;
- struct nlattr *policy, *attr;
- enum netlink_attribute_type type;
- bool again;
+ /* nested + type */
+ int common = 2 * nla_attr_size(sizeof(u32));
-send_attribute:
- again = false;
+ switch (pt->type) {
+ case NLA_UNSPEC:
+ case NLA_REJECT:
+ /* these actually don't need any space */
+ return 0;
+ case NLA_NESTED:
+ case NLA_NESTED_ARRAY:
+ /* common, policy idx, policy maxattr */
+ return common + 2 * nla_attr_size(sizeof(u32));
+ case NLA_U8:
+ case NLA_U16:
+ case NLA_U32:
+ case NLA_U64:
+ case NLA_MSECS:
+ case NLA_S8:
+ case NLA_S16:
+ case NLA_S32:
+ case NLA_S64:
+ /* maximum is common, u64 min/max with padding */
+ return common +
+ 2 * (nla_attr_size(0) + nla_attr_size(sizeof(u64)));
+ case NLA_BITFIELD32:
+ return common + nla_attr_size(sizeof(u32));
+ case NLA_STRING:
+ case NLA_NUL_STRING:
+ case NLA_BINARY:
+ /* maximum is common, u32 min-length/max-length */
+ return common + 2 * nla_attr_size(sizeof(u32));
+ case NLA_FLAG:
+ return common;
+ }
- pt = &state->policies[state->policy_idx].policy[state->attr_idx];
+ /* this should then cause a warning later */
+ return 0;
+}
- policy = nla_nest_start(skb, state->policy_idx);
- if (!policy)
- return -ENOBUFS;
+static int
+__netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state,
+ struct sk_buff *skb,
+ const struct nla_policy *pt,
+ int nestattr)
+{
+ int estimate = netlink_policy_dump_attr_size_estimate(pt);
+ enum netlink_attribute_type type;
+ struct nlattr *attr;
- attr = nla_nest_start(skb, state->attr_idx);
+ attr = nla_nest_start(skb, nestattr);
if (!attr)
- goto nla_put_failure;
+ return -ENOBUFS;
switch (pt->type) {
default:
case NLA_UNSPEC:
case NLA_REJECT:
/* skip - use NLA_MIN_LEN to advertise such */
- nla_nest_cancel(skb, policy);
- again = true;
- goto next;
+ nla_nest_cancel(skb, attr);
+ return -ENODATA;
case NLA_NESTED:
type = NL_ATTR_TYPE_NESTED;
fallthrough;
case NLA_NESTED_ARRAY:
if (pt->type == NLA_NESTED_ARRAY)
type = NL_ATTR_TYPE_NESTED_ARRAY;
- if (pt->nested_policy && pt->len &&
+ if (state && pt->nested_policy && pt->len &&
(nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_IDX,
- get_policy_idx(state, pt->nested_policy)) ||
+ netlink_policy_dump_get_policy_idx(state,
+ pt->nested_policy,
+ pt->len)) ||
nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE,
pt->len)))
goto nla_put_failure;
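
As a quick check of netlink_policy_dump_attr_size_estimate() above, assuming nla_attr_size(len) is NLA_HDRLEN + len, i.e. 4 + len bytes:

/* NLA_U32 example:
 *   common (nested + type)   = 2 * nla_attr_size(sizeof(u32))
 *                             = 2 * 8            = 16
 *   u64 min/max with padding = 2 * (nla_attr_size(0) + nla_attr_size(sizeof(u64)))
 *                             = 2 * (4 + 12)     = 32
 *   estimate                  = 16 + 32          = 48 bytes
 */

The WARN_ON(attr->nla_len > estimate) at the end of __netlink_policy_dump_write_attr() only makes sense if these estimates are upper bounds on what each case can emit.
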
@@ -204,6 +289,14 @@ send_attribute:
else
type = NL_ATTR_TYPE_U64;
+ if (pt->validation_type == NLA_VALIDATE_MASK) {
+ if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MASK,
+ pt->mask,
+ NL_POLICY_TYPE_ATTR_PAD))
+ goto nla_put_failure;
+ break;
+ }
+
nla_get_range_unsigned(pt, &range);
if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_U,
@@ -243,12 +336,6 @@ send_attribute:
pt->bitfield32_valid))
goto nla_put_failure;
break;
- case NLA_EXACT_LEN:
- type = NL_ATTR_TYPE_BINARY;
- if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len) ||
- nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, pt->len))
- goto nla_put_failure;
- break;
case NLA_STRING:
case NLA_NUL_STRING:
case NLA_BINARY:
@@ -258,14 +345,27 @@ send_attribute:
type = NL_ATTR_TYPE_NUL_STRING;
else
type = NL_ATTR_TYPE_BINARY;
- if (pt->len && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH,
- pt->len))
- goto nla_put_failure;
- break;
- case NLA_MIN_LEN:
- type = NL_ATTR_TYPE_BINARY;
- if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len))
+
+ if (pt->validation_type == NLA_VALIDATE_RANGE ||
+ pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG) {
+ struct netlink_range_validation range;
+
+ nla_get_range_unsigned(pt, &range);
+
+ if (range.min &&
+ nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH,
+ range.min))
+ goto nla_put_failure;
+
+ if (range.max < U16_MAX &&
+ nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH,
+ range.max))
+ goto nla_put_failure;
+ } else if (pt->len &&
+ nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH,
+ pt->len)) {
goto nla_put_failure;
+ }
break;
case NLA_FLAG:
type = NL_ATTR_TYPE_FLAG;
@@ -275,8 +375,66 @@ send_attribute:
if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_TYPE, type))
goto nla_put_failure;
- /* finish and move state to next attribute */
nla_nest_end(skb, attr);
+ WARN_ON(attr->nla_len > estimate);
+
+ return 0;
+nla_put_failure:
+ nla_nest_cancel(skb, attr);
+ return -ENOBUFS;
+}
+
+/**
+ * netlink_policy_dump_write_attr - write a given attribute policy
+ * @skb: the message skb to write to
+ * @pt: the attribute's policy
+ * @nestattr: the nested attribute ID to use
+ *
+ * Returns: 0 on success, an error code otherwise; -%ENODATA is
+ * special, indicating that there's no policy data and
+ * the attribute is generally rejected.
+ */
+int netlink_policy_dump_write_attr(struct sk_buff *skb,
+ const struct nla_policy *pt,
+ int nestattr)
+{
+ return __netlink_policy_dump_write_attr(NULL, skb, pt, nestattr);
+}
+
+/**
+ * netlink_policy_dump_write - write current policy dump attributes
+ * @skb: the message skb to write to
+ * @state: the policy dump state
+ *
+ * Returns: 0 on success, an error code otherwise
+ */
+int netlink_policy_dump_write(struct sk_buff *skb,
+ struct netlink_policy_dump_state *state)
+{
+ const struct nla_policy *pt;
+ struct nlattr *policy;
+ bool again;
+ int err;
+
+send_attribute:
+ again = false;
+
+ pt = &state->policies[state->policy_idx].policy[state->attr_idx];
+
+ policy = nla_nest_start(skb, state->policy_idx);
+ if (!policy)
+ return -ENOBUFS;
+
+ err = __netlink_policy_dump_write_attr(state, skb, pt, state->attr_idx);
+ if (err == -ENODATA) {
+ nla_nest_cancel(skb, policy);
+ again = true;
+ goto next;
+ } else if (err) {
+ goto nla_put_failure;
+ }
+
+ /* finish and move state to next attribute */
nla_nest_end(skb, policy);
next:
@@ -299,9 +457,13 @@ nla_put_failure:
return -ENOBUFS;
}
-void netlink_policy_dump_free(unsigned long _state)
+/**
+ * netlink_policy_dump_free - free policy dump state
+ * @state: the policy dump state to free
+ *
+ * Call this from the done() method to ensure dump state is freed.
+ */
+void netlink_policy_dump_free(struct netlink_policy_dump_state *state)
{
- struct nl_policy_dump *state = (void *)_state;
-
kfree(state);
}
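
Taken together, the reworked policy.c API is meant to be driven from a family's dump callbacks: collect policies in start(), emit them in dumpit() with the loop/write pair, and release the state in done(). A minimal sketch, reusing the hypothetical foo family and policies from the genetlink example above:

struct foo_dump_ctx {
	struct netlink_policy_dump_state *state;
};

static int foo_policy_start(struct netlink_callback *cb)
{
	struct foo_dump_ctx *ctx = (void *)cb->ctx;

	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));

	/* add every (possibly nested) policy that should be dumped */
	return netlink_policy_dump_add_policy(&ctx->state, foo_set_policy,
					      ARRAY_SIZE(foo_set_policy) - 1);
}

static int foo_policy_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct foo_dump_ctx *ctx = (void *)cb->ctx;

	while (netlink_policy_dump_loop(ctx->state)) {
		struct nlattr *nest;
		void *hdr;

		hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, &foo_family,
				  NLM_F_MULTI, FOO_CMD_GETPOLICY);
		if (!hdr)
			break;

		nest = nla_nest_start(skb, FOO_ATTR_POLICY);
		if (!nest || netlink_policy_dump_write(skb, ctx->state)) {
			genlmsg_cancel(skb, hdr);
			break;	/* skb full, resume on the next dump pass */
		}

		nla_nest_end(skb, nest);
		genlmsg_end(skb, hdr);
	}

	return skb->len;
}

static int foo_policy_done(struct netlink_callback *cb)
{
	struct foo_dump_ctx *ctx = (void *)cb->ctx;

	netlink_policy_dump_free(ctx->state);
	return 0;
}
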
diff --git a/net/nfc/Kconfig b/net/nfc/Kconfig
index 9b27599870e3..96b91674dd37 100644
--- a/net/nfc/Kconfig
+++ b/net/nfc/Kconfig
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-# NFC sybsystem configuration
+# NFC subsystem configuration
#
menuconfig NFC
diff --git a/net/nfc/core.c b/net/nfc/core.c
index eb377f87bcae..573c80c6ff7a 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -189,7 +189,8 @@ static const struct rfkill_ops nfc_rfkill_ops = {
* nfc_start_poll - start polling for nfc targets
*
* @dev: The nfc device that must start polling
- * @protocols: bitset of nfc protocols that must be used for polling
+ * @im_protocols: bitset of nfc initiator protocols to be used for polling
+ * @tm_protocols: bitset of nfc transport protocols to be used for polling
*
* The device remains polling for targets until a target is found or
* the nfc_stop_poll function is called.
@@ -436,6 +437,7 @@ error:
*
* @dev: The nfc device that found the target
* @target_idx: index of the target that must be deactivated
+ * @mode: idle or sleep?
*/
int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode)
{
@@ -703,7 +705,11 @@ EXPORT_SYMBOL(nfc_tm_deactivated);
/**
* nfc_alloc_send_skb - allocate a skb for data exchange responses
*
+ * @dev: device sending the response
+ * @sk: socket sending the response
+ * @flags: MSG_DONTWAIT flag
* @size: size to allocate
+ * @err: pointer to memory to store the error code
*/
struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk,
unsigned int flags, unsigned int size,
@@ -1039,6 +1045,8 @@ struct nfc_dev *nfc_get_device(unsigned int idx)
*
* @ops: device operations
* @supported_protocols: NFC protocols supported by the device
+ * @tx_headroom: reserved space at beginning of skb
+ * @tx_tailroom: reserved space at end of skb
*/
struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops,
u32 supported_protocols,
diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c
index e3599ed4a7a8..da7e2112771f 100644
--- a/net/nfc/digital_core.c
+++ b/net/nfc/digital_core.c
@@ -458,6 +458,9 @@ static void digital_add_poll_tech(struct nfc_digital_dev *ddev, u8 rf_tech,
/**
* start_poll operation
+ * @nfc_dev: device to be polled
+ * @im_protocols: bitset of nfc initiator protocols to be used for polling
+ * @tm_protocols: bitset of nfc transport protocols to be used for polling
*
* For every supported protocol, the corresponding polling function is added
* to the table of polling technologies (ddev->poll_techs[]) using
diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c
index 304b1a9bb18a..5971fb6f51cc 100644
--- a/net/nfc/digital_dep.c
+++ b/net/nfc/digital_dep.c
@@ -38,9 +38,6 @@
#define DIGITAL_GB_BIT 0x02
-#define DIGITAL_NFC_DEP_REQ_RES_HEADROOM 2 /* SoD: [SB (NFC-A)] + LEN */
-#define DIGITAL_NFC_DEP_REQ_RES_TAILROOM 2 /* EoD: 2-byte CRC */
-
#define DIGITAL_NFC_DEP_PFB_TYPE(pfb) ((pfb) & 0xE0)
#define DIGITAL_NFC_DEP_PFB_TIMEOUT_BIT 0x10
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 741da8f81c2b..02a1f13f0798 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -165,7 +165,12 @@ static void nci_reset_req(struct nci_dev *ndev, unsigned long opt)
static void nci_init_req(struct nci_dev *ndev, unsigned long opt)
{
- nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, 0, NULL);
+ u8 plen = 0;
+
+ if (opt)
+ plen = sizeof(struct nci_core_init_v2_cmd);
+
+ nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, plen, (void *)opt);
}
static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt)
@@ -497,7 +502,16 @@ static int nci_open_device(struct nci_dev *ndev)
}
if (!rc) {
- rc = __nci_request(ndev, nci_init_req, 0,
+ struct nci_core_init_v2_cmd nci_init_v2_cmd = {
+ .feature1 = NCI_FEATURE_DISABLE,
+ .feature2 = NCI_FEATURE_DISABLE
+ };
+ unsigned long opt = 0;
+
+ if (ndev->nci_ver & NCI_VER_2_MASK)
+ opt = (unsigned long)&nci_init_v2_cmd;
+
+ rc = __nci_request(ndev, nci_init_req, opt,
msecs_to_jiffies(NCI_INIT_TIMEOUT));
}
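
The NCI 2.x path above hands a pointer to an nci_core_init_v2_cmd through the opaque opt argument, while NCI 1.x keeps sending CORE_INIT_CMD with an empty payload (plen stays 0 when opt is zero). Based on the designated initializers used here, the payload is presumably a two-byte structure along these lines (a sketch, not the authoritative header definition):

/* CORE_INIT_CMD payload for NCI 2.x; see include/net/nfc/nci.h */
struct nci_core_init_v2_cmd {
	u8 feature1;	/* e.g. NCI_FEATURE_DISABLE */
	u8 feature2;
} __packed;
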
@@ -1112,6 +1126,8 @@ static struct nfc_ops nci_nfc_ops = {
*
* @ops: device operations
* @supported_protocols: NFC protocols supported by the device
+ * @tx_headroom: Reserved space at beginning of skb
+ * @tx_tailroom: Reserved space at end of skb
*/
struct nci_dev *nci_allocate_device(struct nci_ops *ops,
__u32 supported_protocols,
diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c
index c18e76d6d8ba..6b275a387a92 100644
--- a/net/nfc/nci/hci.c
+++ b/net/nfc/nci/hci.c
@@ -363,16 +363,13 @@ exit:
}
static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe,
- u8 result, struct sk_buff *skb)
+ struct sk_buff *skb)
{
struct nci_conn_info *conn_info;
- u8 status = result;
conn_info = ndev->hci_dev->conn_info;
- if (!conn_info) {
- status = NCI_STATUS_REJECTED;
+ if (!conn_info)
goto exit;
- }
conn_info->rx_skb = skb;
@@ -388,7 +385,7 @@ static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe,
{
switch (type) {
case NCI_HCI_HCP_RESPONSE:
- nci_hci_resp_received(ndev, pipe, instruction, skb);
+ nci_hci_resp_received(ndev, pipe, skb);
break;
case NCI_HCI_HCP_COMMAND:
nci_hci_cmd_received(ndev, pipe, instruction, skb);
diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c
index 33e1170817f0..98af04c86b2c 100644
--- a/net/nfc/nci/ntf.c
+++ b/net/nfc/nci/ntf.c
@@ -27,6 +27,23 @@
/* Handle NCI Notification packets */
+static void nci_core_reset_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ /* Handle NCI 2.x core reset notification */
+ struct nci_core_reset_ntf *ntf = (void *)skb->data;
+
+ ndev->nci_ver = ntf->nci_ver;
+ pr_debug("nci_ver 0x%x, config_status 0x%x\n",
+ ntf->nci_ver, ntf->config_status);
+
+ ndev->manufact_id = ntf->manufact_id;
+ ndev->manufact_specific_info =
+ __le32_to_cpu(ntf->manufact_specific_info);
+
+ nci_req_complete(ndev, NCI_STATUS_OK);
+}
+
static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev,
struct sk_buff *skb)
{
@@ -756,6 +773,10 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb)
}
switch (ntf_opcode) {
+ case NCI_OP_CORE_RESET_NTF:
+ nci_core_reset_ntf_packet(ndev, skb);
+ break;
+
case NCI_OP_CORE_CONN_CREDITS_NTF:
nci_core_conn_credits_ntf_packet(ndev, skb);
break;
diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c
index a48297b79f34..e9605922a322 100644
--- a/net/nfc/nci/rsp.c
+++ b/net/nfc/nci/rsp.c
@@ -31,16 +31,19 @@ static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
pr_debug("status 0x%x\n", rsp->status);
- if (rsp->status == NCI_STATUS_OK) {
- ndev->nci_ver = rsp->nci_ver;
- pr_debug("nci_ver 0x%x, config_status 0x%x\n",
- rsp->nci_ver, rsp->config_status);
- }
+ /* Handle NCI 1.x ver */
+ if (skb->len != 1) {
+ if (rsp->status == NCI_STATUS_OK) {
+ ndev->nci_ver = rsp->nci_ver;
+ pr_debug("nci_ver 0x%x, config_status 0x%x\n",
+ rsp->nci_ver, rsp->config_status);
+ }
- nci_req_complete(ndev, rsp->status);
+ nci_req_complete(ndev, rsp->status);
+ }
}
-static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
+static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb)
{
struct nci_core_init_rsp_1 *rsp_1 = (void *) skb->data;
struct nci_core_init_rsp_2 *rsp_2;
@@ -48,16 +51,14 @@ static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
pr_debug("status 0x%x\n", rsp_1->status);
if (rsp_1->status != NCI_STATUS_OK)
- goto exit;
+ return rsp_1->status;
ndev->nfcc_features = __le32_to_cpu(rsp_1->nfcc_features);
ndev->num_supported_rf_interfaces = rsp_1->num_supported_rf_interfaces;
- if (ndev->num_supported_rf_interfaces >
- NCI_MAX_SUPPORTED_RF_INTERFACES) {
- ndev->num_supported_rf_interfaces =
- NCI_MAX_SUPPORTED_RF_INTERFACES;
- }
+ ndev->num_supported_rf_interfaces =
+ min((int)ndev->num_supported_rf_interfaces,
+ NCI_MAX_SUPPORTED_RF_INTERFACES);
memcpy(ndev->supported_rf_interfaces,
rsp_1->supported_rf_interfaces,
@@ -77,6 +78,58 @@ static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
ndev->manufact_specific_info =
__le32_to_cpu(rsp_2->manufact_specific_info);
+ return NCI_STATUS_OK;
+}
+
+static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data;
+ u8 *supported_rf_interface = rsp->supported_rf_interfaces;
+ u8 rf_interface_idx = 0;
+ u8 rf_extension_cnt = 0;
+
+ pr_debug("status %x\n", rsp->status);
+
+ if (rsp->status != NCI_STATUS_OK)
+ return rsp->status;
+
+ ndev->nfcc_features = __le32_to_cpu(rsp->nfcc_features);
+ ndev->num_supported_rf_interfaces = rsp->num_supported_rf_interfaces;
+
+ ndev->num_supported_rf_interfaces =
+ min((int)ndev->num_supported_rf_interfaces,
+ NCI_MAX_SUPPORTED_RF_INTERFACES);
+
+ while (rf_interface_idx < ndev->num_supported_rf_interfaces) {
+ ndev->supported_rf_interfaces[rf_interface_idx++] = *supported_rf_interface++;
+
+ /* skip rf extension parameters */
+ rf_extension_cnt = *supported_rf_interface++;
+ supported_rf_interface += rf_extension_cnt;
+ }
+
+ ndev->max_logical_connections = rsp->max_logical_connections;
+ ndev->max_routing_table_size =
+ __le16_to_cpu(rsp->max_routing_table_size);
+ ndev->max_ctrl_pkt_payload_len =
+ rsp->max_ctrl_pkt_payload_len;
+ ndev->max_size_for_large_params = NCI_MAX_LARGE_PARAMS_NCI_v2;
+
+ return NCI_STATUS_OK;
+}
+
+static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ u8 status = 0;
+
+ if (!(ndev->nci_ver & NCI_VER_2_MASK))
+ status = nci_core_init_rsp_packet_v1(ndev, skb);
+ else
+ status = nci_core_init_rsp_packet_v2(ndev, skb);
+
+ if (status != NCI_STATUS_OK)
+ goto exit;
+
pr_debug("nfcc_features 0x%x\n",
ndev->nfcc_features);
pr_debug("num_supported_rf_interfaces %d\n",
@@ -103,7 +156,7 @@ static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
ndev->manufact_specific_info);
exit:
- nci_req_complete(ndev, rsp_1->status);
+ nci_req_complete(ndev, status);
}
static void nci_core_set_config_rsp_packet(struct nci_dev *ndev,
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index e894254c17d4..e161ef2d4720 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -852,6 +852,7 @@ static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info)
if (!dev->polling) {
device_unlock(&dev->dev);
+ nfc_put_device(dev);
return -EINVAL;
}
@@ -1217,7 +1218,7 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info)
u32 idx;
char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1];
- if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME])
return -EINVAL;
idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
@@ -1226,7 +1227,7 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info)
if (!dev)
return -ENODEV;
- nla_strlcpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME],
+ nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME],
sizeof(firmware_name));
rc = nfc_fw_download(dev, firmware_name);
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 955c195ae14b..9c7eb8455ba8 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -105,7 +105,7 @@ static int rawsock_connect(struct socket *sock, struct sockaddr *_addr,
if (addr->target_idx > dev->target_next_idx - 1 ||
addr->target_idx < dev->target_next_idx - dev->n_targets) {
rc = -EINVAL;
- goto error;
+ goto put_dev;
}
rc = nfc_activate_target(dev, addr->target_idx, addr->nfc_protocol);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 2611657f40ca..e8902a7e60f2 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -9,7 +9,6 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/openvswitch.h>
-#include <linux/netfilter_ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -200,6 +199,9 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
__be32 lse;
int err;
+ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
+ return -ENOMEM;
+
stack = mpls_hdr(skb);
lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask);
err = skb_mpls_update_lse(skb, lse);
@@ -278,9 +280,11 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
*/
static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
{
- skb_pull_rcsum(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- skb_reset_mac_len(skb);
+ int err;
+
+ err = skb_eth_pop(skb);
+ if (err)
+ return err;
/* safe right before invalidate_flow_key */
key->mac_proto = MAC_PROTO_NONE;
@@ -291,22 +295,12 @@ static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_eth *ethh)
{
- struct ethhdr *hdr;
-
- /* Add the new Ethernet header */
- if (skb_cow_head(skb, ETH_HLEN) < 0)
- return -ENOMEM;
-
- skb_push(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- skb_reset_mac_len(skb);
-
- hdr = eth_hdr(skb);
- ether_addr_copy(hdr->h_source, ethh->addresses.eth_src);
- ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst);
- hdr->h_proto = skb->protocol;
+ int err;
- skb_postpush_rcsum(skb, hdr, ETH_HLEN);
+ err = skb_eth_push(skb, ethh->addresses.eth_dst,
+ ethh->addresses.eth_src);
+ if (err)
+ return err;
/* safe right before invalidate_flow_key */
key->mac_proto = MAC_PROTO_ETHERNET;
@@ -742,7 +736,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}
-static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int ovs_vport_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
struct vport *vport = data->vport;
@@ -848,13 +843,9 @@ static void ovs_fragment(struct net *net, struct vport *vport,
ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
} else if (key->eth.type == htons(ETH_P_IPV6)) {
- const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
unsigned long orig_dst;
struct rt6_info ovs_rt;
- if (!v6ops)
- goto err;
-
prepare_frag(vport, skb, orig_network_offset,
ovs_key_mac_proto(key));
memset(&ovs_rt, 0, sizeof(ovs_rt));
@@ -866,7 +857,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
skb_dst_set_noref(skb, &ovs_rt.dst);
IP6CB(skb)->frag_max_size = mru;
- v6ops->fragment(net, skb->sk, skb, ovs_vport_output);
+ ipv6_stub->ipv6_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
} else {
WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
@@ -925,7 +916,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
upcall.mru = OVS_CB(skb)->mru;
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
- a = nla_next(a, &rem)) {
+ a = nla_next(a, &rem)) {
switch (nla_type(a)) {
case OVS_USERSPACE_ATTR_USERDATA:
upcall.userdata = a;
@@ -968,17 +959,13 @@ static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
const struct nlattr *attr, bool last)
{
- /* The first action is always 'OVS_DEC_TTL_ATTR_ARG'. */
- struct nlattr *dec_ttl_arg = nla_data(attr);
- int rem = nla_len(attr);
+ /* The first attribute is always 'OVS_DEC_TTL_ATTR_ACTION'. */
+ struct nlattr *actions = nla_data(attr);
- if (nla_len(dec_ttl_arg)) {
- struct nlattr *actions = nla_next(dec_ttl_arg, &rem);
+ if (nla_len(actions))
+ return clone_execute(dp, skb, key, 0, nla_data(actions),
+ nla_len(actions), last, false);
- if (actions)
- return clone_execute(dp, skb, key, 0, actions, rem,
- last, false);
- }
consume_skb(skb);
return 0;
}
@@ -1222,7 +1209,7 @@ static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
return -EHOSTUNREACH;
key->ip.ttl = --nh->hop_limit;
- } else {
+ } else if (skb->protocol == htons(ETH_P_IP)) {
struct iphdr *nh;
u8 old_ttl;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 12d42ab0193b..5eddfe7bd391 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1037,6 +1037,14 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
return -EINVAL;
}
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP &&
+ nf_ct_is_confirmed(ct) && nf_conntrack_tcp_established(ct)) {
+ /* Be liberal for tcp packets so that out-of-window
+ * packets are not marked invalid.
+ */
+ nf_ct_set_tcp_be_liberal(ct);
+ }
}
return 0;
@@ -1905,8 +1913,8 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
lockdep_ovsl_is_held())
kfree_rcu(ct_limit, rcu);
}
- kfree(ovs_net->ct_limit_info->limits);
- kfree(ovs_net->ct_limit_info);
+ kfree(info->limits);
+ kfree(info);
}
static struct sk_buff *
@@ -2025,15 +2033,11 @@ static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
struct sk_buff *reply)
{
struct ovs_zone_limit zone_limit;
- int err;
zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE;
zone_limit.limit = info->default_limit;
- err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
- if (err)
- return err;
- return 0;
+ return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
}
static int __ovs_ct_limit_get_zone_limit(struct net *net,
@@ -2235,7 +2239,7 @@ exit_err:
return err;
}
-static struct genl_ops ct_limit_genl_ops[] = {
+static const struct genl_small_ops ct_limit_genl_ops[] = {
{ .cmd = OVS_CT_LIMIT_CMD_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
@@ -2267,8 +2271,8 @@ struct genl_family dp_ct_limit_genl_family __ro_after_init = {
.policy = ct_limit_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = ct_limit_genl_ops,
- .n_ops = ARRAY_SIZE(ct_limit_genl_ops),
+ .small_ops = ct_limit_genl_ops,
+ .n_small_ops = ARRAY_SIZE(ct_limit_genl_ops),
.mcgrps = &ovs_ct_limit_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
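
This conversion (repeated below for the packet, flow, datapath and vport families) swaps struct genl_ops for the new, more compact struct genl_small_ops, registered through .small_ops/.n_small_ops. A rough sketch of the slimmed-down structure, assuming it drops the per-op policy, start and done fields and therefore keeps relying on the family-level policy:

/* sketch; the real definition lives in include/net/genetlink.h */
struct genl_small_ops {
	int	(*doit)(struct sk_buff *skb, struct genl_info *info);
	int	(*dumpit)(struct sk_buff *skb, struct netlink_callback *cb);
	u8	cmd;
	u8	internal_flags;
	u8	flags;
	u8	validate;
};

The OVS op arrays shown here only set doit/dumpit handlers, flags and validate bits, all of which carry over unchanged, so the switch is purely a memory saving.
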
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 6e47ef7ef036..9d6ef6cb9b26 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -182,7 +182,7 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
head = vport_hash_bucket(dp, port_no);
hlist_for_each_entry_rcu(vport, head, dp_hash_node,
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
if (vport->port_no == port_no)
return vport;
}
@@ -254,7 +254,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
error = ovs_execute_actions(dp, skb, sf_acts, key);
if (unlikely(error))
net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n",
- ovs_dp_name(dp), error);
+ ovs_dp_name(dp), error);
stats_counter = &stats->n_hit;
@@ -302,7 +302,7 @@ err:
static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
const struct dp_upcall_info *upcall_info,
- uint32_t cutlen)
+ uint32_t cutlen)
{
unsigned int gso_type = skb_shinfo(skb)->gso_type;
struct sw_flow_key later_key;
@@ -652,7 +652,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
[OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 },
};
-static const struct genl_ops dp_packet_genl_ops[] = {
+static const struct genl_small_ops dp_packet_genl_ops[] = {
{ .cmd = OVS_PACKET_CMD_EXECUTE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -668,8 +668,8 @@ static struct genl_family dp_packet_genl_family __ro_after_init = {
.policy = packet_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_packet_genl_ops,
- .n_ops = ARRAY_SIZE(dp_packet_genl_ops),
+ .small_ops = dp_packet_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_packet_genl_ops),
.module = THIS_MODULE,
};
@@ -1080,11 +1080,12 @@ error:
}
/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
-static noinline_for_stack struct sw_flow_actions *get_flow_actions(struct net *net,
- const struct nlattr *a,
- const struct sw_flow_key *key,
- const struct sw_flow_mask *mask,
- bool log)
+static noinline_for_stack
+struct sw_flow_actions *get_flow_actions(struct net *net,
+ const struct nlattr *a,
+ const struct sw_flow_key *key,
+ const struct sw_flow_mask *mask,
+ bool log)
{
struct sw_flow_actions *acts;
struct sw_flow_key masked_key;
@@ -1383,7 +1384,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
ovs_notify(&dp_flow_genl_family, reply, info);
} else {
- netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply));
+ netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0,
+ PTR_ERR(reply));
}
}
@@ -1451,7 +1453,7 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
[OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 },
};
-static const struct genl_ops dp_flow_genl_ops[] = {
+static const struct genl_small_ops dp_flow_genl_ops[] = {
{ .cmd = OVS_FLOW_CMD_NEW,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -1483,8 +1485,8 @@ static struct genl_family dp_flow_genl_family __ro_after_init = {
.policy = flow_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_flow_genl_ops,
- .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
+ .small_ops = dp_flow_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_flow_genl_ops),
.mcgrps = &ovs_dp_flow_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
@@ -1513,7 +1515,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
- flags, cmd);
+ flags, cmd);
if (!ovs_header)
goto error;
@@ -1572,11 +1574,13 @@ static struct datapath *lookup_datapath(struct net *net,
return dp ? dp : ERR_PTR(-ENODEV);
}
-static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
+static void ovs_dp_reset_user_features(struct sk_buff *skb,
+ struct genl_info *info)
{
struct datapath *dp;
- dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr,
+ info->attrs);
if (IS_ERR(dp))
return;
@@ -1699,13 +1703,13 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
parms.port_no = OVSP_LOCAL;
parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
- err = ovs_dp_change(dp, a);
- if (err)
- goto err_destroy_meters;
-
/* So far only local changes have been made, now need the lock. */
ovs_lock();
+ err = ovs_dp_change(dp, a);
+ if (err)
+ goto err_unlock_and_destroy_meters;
+
vport = new_vport(&parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
@@ -1721,8 +1725,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
ovs_dp_reset_user_features(skb, info);
}
- ovs_unlock();
- goto err_destroy_meters;
+ goto err_unlock_and_destroy_meters;
}
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
@@ -1737,7 +1740,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
ovs_notify(&dp_datapath_genl_family, reply, info);
return 0;
-err_destroy_meters:
+err_unlock_and_destroy_meters:
+ ovs_unlock();
ovs_meters_exit(dp);
err_destroy_ports:
kfree(dp->ports);
@@ -1914,7 +1918,7 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)),
};
-static const struct genl_ops dp_datapath_genl_ops[] = {
+static const struct genl_small_ops dp_datapath_genl_ops[] = {
{ .cmd = OVS_DP_CMD_NEW,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -1946,8 +1950,8 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = {
.policy = datapath_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_datapath_genl_ops,
- .n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
+ .small_ops = dp_datapath_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_datapath_genl_ops),
.mcgrps = &ovs_dp_datapath_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
@@ -2075,7 +2079,7 @@ static unsigned int ovs_get_max_headroom(struct datapath *dp)
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
dev = vport->dev;
dev_headroom = netdev_get_fwd_headroom(dev);
if (dev_headroom > max_headroom)
@@ -2093,10 +2097,11 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom)
int i;
dp->max_headroom = new_headroom;
- for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
- lockdep_ovsl_is_held())
+ lockdep_ovsl_is_held())
netdev_set_rx_headroom(vport->dev, new_headroom);
+ }
}
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
@@ -2396,7 +2401,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
[OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 },
};
-static const struct genl_ops dp_vport_genl_ops[] = {
+static const struct genl_small_ops dp_vport_genl_ops[] = {
{ .cmd = OVS_VPORT_CMD_NEW,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -2428,8 +2433,8 @@ struct genl_family dp_vport_genl_family __ro_after_init = {
.policy = vport_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_vport_genl_ops,
- .n_ops = ARRAY_SIZE(dp_vport_genl_ops),
+ .small_ops = dp_vport_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_vport_genl_ops),
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
@@ -2476,13 +2481,19 @@ error:
static int __net_init ovs_init_net(struct net *net)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+ int err;
INIT_LIST_HEAD(&ovs_net->dps);
INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance);
+
+ err = ovs_ct_init(net);
+ if (err)
+ return err;
+
schedule_delayed_work(&ovs_net->masks_rebalance,
msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL));
- return ovs_ct_init(net);
+ return 0;
}
static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2551,7 +2562,8 @@ static int __init dp_init(void)
{
int err;
- BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb));
+ BUILD_BUG_ON(sizeof(struct ovs_skb_cb) >
+ sizeof_field(struct sk_buff, cb));
pr_info("Open vSwitch switching datapath\n");
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index b03d142ec82e..c7f34d6a9934 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -294,6 +294,10 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
/**
* Parse vlan tag from vlan header.
+ * @skb: skb containing frame to parse
+ * @key_vh: pointer to parsed vlan tag
+ * @untag_vlan: should the vlan header be removed from the frame
+ *
* Returns ERROR on memory error.
* Returns 0 if it encounters a non-vlan or incomplete packet.
* Returns 1 after successfully parsing vlan tag.
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 9d3e50c4d29f..4c5c2331e764 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2503,28 +2503,42 @@ static int validate_and_copy_dec_ttl(struct net *net,
__be16 eth_type, __be16 vlan_tci,
u32 mpls_label_count, bool log)
{
- int start, err;
- u32 nested = true;
+ const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1];
+ int start, action_start, err, rem;
+ const struct nlattr *a, *actions;
+
+ memset(attrs, 0, sizeof(attrs));
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
- if (!nla_len(attr))
- return ovs_nla_add_action(sfa, OVS_ACTION_ATTR_DEC_TTL,
- NULL, 0, log);
+ /* Ignore unknown attributes to be future proof. */
+ if (type > OVS_DEC_TTL_ATTR_MAX)
+ continue;
+
+ if (!type || attrs[type])
+ return -EINVAL;
+
+ attrs[type] = a;
+ }
+
+ actions = attrs[OVS_DEC_TTL_ATTR_ACTION];
+ if (rem || !actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
+ return -EINVAL;
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log);
if (start < 0)
return start;
- err = ovs_nla_add_action(sfa, OVS_DEC_TTL_ATTR_ACTION, &nested,
- sizeof(nested), log);
+ action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log);
+ if (action_start < 0)
+ return action_start;
- if (err)
- return err;
-
- err = __ovs_nla_copy_actions(net, attr, key, sfa, eth_type,
+ err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type,
vlan_tci, mpls_label_count, log);
if (err)
return err;
+ add_nested_action_end(*sfa, action_start);
add_nested_action_end(*sfa, start);
return 0;
}
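
With this change a dec_ttl action is serialized as a properly nested attribute: OVS_ACTION_ATTR_DEC_TTL carries an OVS_DEC_TTL_ATTR_ACTION attribute, which in turn holds the list of actions to execute when the TTL reaches zero, and unknown sibling attributes are skipped for forward compatibility. A hedged sketch of the emitting side, mirroring dec_ttl_action_to_attr() further down; put_dec_ttl() is a hypothetical helper:

/* OVS_ACTION_ATTR_DEC_TTL
 *   OVS_DEC_TTL_ATTR_ACTION
 *     <nested actions run when the TTL hits zero>
 */
static int put_dec_ttl(struct sk_buff *skb, const struct nlattr *actions)
{
	struct nlattr *start, *action_start;

	start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL);
	if (!start)
		return -EMSGSIZE;

	action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION);
	if (!action_start)
		goto cancel;

	if (ovs_nla_put_actions(nla_data(actions), nla_len(actions), skb))
		goto cancel;

	nla_nest_end(skb, action_start);
	nla_nest_end(skb, start);
	return 0;

cancel:
	nla_nest_cancel(skb, start);
	return -EMSGSIZE;
}
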
@@ -3487,20 +3501,42 @@ out:
static int dec_ttl_action_to_attr(const struct nlattr *attr,
struct sk_buff *skb)
{
- int err = 0, rem = nla_len(attr);
- struct nlattr *start;
+ struct nlattr *start, *action_start;
+ const struct nlattr *a;
+ int err = 0, rem;
start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL);
-
if (!start)
return -EMSGSIZE;
- err = ovs_nla_put_actions(nla_data(attr), rem, skb);
- if (err)
- nla_nest_cancel(skb, start);
- else
- nla_nest_end(skb, start);
+ nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(a)) {
+ case OVS_DEC_TTL_ATTR_ACTION:
+
+ action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION);
+ if (!action_start) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
+ if (err)
+ goto out;
+
+ nla_nest_end(skb, action_start);
+ break;
+ default:
+ /* Ignore all other options to be future compatible */
+ break;
+ }
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+
+out:
+ nla_nest_cancel(skb, start);
return err;
}
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index e2235849a57e..c89c8da99f1a 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -111,12 +111,16 @@ static void flow_free(struct sw_flow *flow)
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
- ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
+ ovs_nla_free_flow_actions((struct sw_flow_actions __force *)
+ flow->sf_acts);
/* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask))
+ for (cpu = 0; cpu < nr_cpu_ids;
+ cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
if (flow->stats[cpu])
kmem_cache_free(flow_stats_cache,
(struct sw_flow_stats __force *)flow->stats[cpu]);
+ }
+
kmem_cache_free(flow_cache, flow);
}
@@ -164,7 +168,6 @@ static struct table_instance *table_instance_alloc(int new_size)
ti->n_buckets = new_size;
ti->node_ver = 0;
- ti->keep_flows = false;
get_random_bytes(&ti->hash_seed, sizeof(u32));
return ti;
@@ -172,7 +175,7 @@ static struct table_instance *table_instance_alloc(int new_size)
static void __mask_array_destroy(struct mask_array *ma)
{
- free_percpu(ma->masks_usage_cntr);
+ free_percpu(ma->masks_usage_stats);
kfree(ma);
}
@@ -192,19 +195,19 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
* zero based counter we store the value at reset, and subtract it
* later when processing.
*/
- for (i = 0; i < ma->max; i++) {
+ for (i = 0; i < ma->max; i++) {
ma->masks_usage_zero_cntr[i] = 0;
for_each_possible_cpu(cpu) {
- u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
- cpu);
+ struct mask_array_stats *stats;
unsigned int start;
u64 counter;
+ stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&ma->syncp);
- counter = usage_counters[i];
- } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ counter = stats->usage_cntrs[i];
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
ma->masks_usage_zero_cntr[i] += counter;
}
@@ -227,9 +230,10 @@ static struct mask_array *tbl_mask_array_alloc(int size)
sizeof(struct sw_flow_mask *) *
size);
- new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
- __alignof__(u64));
- if (!new->masks_usage_cntr) {
+ new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
+ sizeof(u64) * size,
+ __alignof__(u64));
+ if (!new->masks_usage_stats) {
kfree(new);
return NULL;
}
@@ -273,7 +277,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
if (ma_count >= ma->max) {
err = tbl_mask_array_realloc(tbl, ma->max +
- MASK_ARRAY_SIZE_MIN);
+ MASK_ARRAY_SIZE_MIN);
if (err)
return err;
@@ -288,7 +292,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
BUG_ON(ovsl_dereference(ma->masks[ma_count]));
rcu_assign_pointer(ma->masks[ma_count], new);
- WRITE_ONCE(ma->count, ma_count +1);
+ WRITE_ONCE(ma->count, ma_count + 1);
return 0;
}
@@ -309,10 +313,10 @@ static void tbl_mask_array_del_mask(struct flow_table *tbl,
return;
found:
- WRITE_ONCE(ma->count, ma_count -1);
+ WRITE_ONCE(ma->count, ma_count - 1);
- rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]);
- RCU_INIT_POINTER(ma->masks[ma_count -1], NULL);
+ rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]);
+ RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL);
kfree_rcu(mask, rcu);
@@ -386,7 +390,7 @@ static struct mask_cache *tbl_mask_cache_alloc(u32 size)
}
int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
{
- struct mask_cache *mc = rcu_dereference(table->mask_cache);
+ struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
struct mask_cache *new;
if (size == mc->cache_size)
@@ -448,26 +452,23 @@ free_mask_cache:
static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
{
- struct table_instance *ti = container_of(rcu, struct table_instance, rcu);
+ struct table_instance *ti;
+ ti = container_of(rcu, struct table_instance, rcu);
__table_instance_destroy(ti);
}
static void table_instance_flow_free(struct flow_table *table,
- struct table_instance *ti,
- struct table_instance *ufid_ti,
- struct sw_flow *flow,
- bool count)
+ struct table_instance *ti,
+ struct table_instance *ufid_ti,
+ struct sw_flow *flow)
{
hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
- if (count)
- table->count--;
+ table->count--;
if (ovs_identifier_is_ufid(&flow->id)) {
hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
-
- if (count)
- table->ufid_count--;
+ table->ufid_count--;
}
flow_mask_remove(table, flow->mask);
@@ -480,22 +481,25 @@ void table_instance_flow_flush(struct flow_table *table,
{
int i;
- if (ti->keep_flows)
- return;
-
for (i = 0; i < ti->n_buckets; i++) {
- struct sw_flow *flow;
struct hlist_head *head = &ti->buckets[i];
struct hlist_node *n;
+ struct sw_flow *flow;
hlist_for_each_entry_safe(flow, n, head,
flow_table.node[ti->node_ver]) {
table_instance_flow_free(table, ti, ufid_ti,
- flow, false);
+ flow);
ovs_flow_free(flow, true);
}
}
+
+ if (WARN_ON(table->count != 0 ||
+ table->ufid_count != 0)) {
+ table->count = 0;
+ table->ufid_count = 0;
+ }
}
static void table_instance_destroy(struct table_instance *ti,
@@ -596,8 +600,6 @@ static void flow_table_copy_flows(struct table_instance *old,
lockdep_ovsl_is_held())
table_instance_insert(new, flow);
}
-
- old->keep_flows = true;
}
static struct table_instance *table_instance_rehash(struct table_instance *ti,
@@ -632,8 +634,6 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table)
rcu_assign_pointer(flow_table->ti, new_ti);
rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
flow_table->last_rehash = jiffies;
- flow_table->count = 0;
- flow_table->ufid_count = 0;
table_instance_flow_flush(flow_table, old_ti, old_ufid_ti);
table_instance_destroy(old_ti, old_ufid_ti);
@@ -661,7 +661,7 @@ static int flow_key_start(const struct sw_flow_key *key)
return 0;
else
return rounddown(offsetof(struct sw_flow_key, phy),
- sizeof(long));
+ sizeof(long));
}
static bool cmp_key(const struct sw_flow_key *key1,
@@ -673,7 +673,7 @@ static bool cmp_key(const struct sw_flow_key *key1,
long diffs = 0;
int i;
- for (i = key_start; i < key_end; i += sizeof(long))
+ for (i = key_start; i < key_end; i += sizeof(long))
diffs |= *cp1++ ^ *cp2++;
return diffs == 0;
@@ -713,7 +713,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
(*n_mask_hit)++;
hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
if (flow->mask == mask && flow->flow_table.hash == hash &&
flow_cmp_masked_key(flow, &masked_key, &mask->range))
return flow;
@@ -723,6 +723,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
/* Flow lookup does full lookup on flow table. It starts with
* mask from index passed in *index.
+ * This function MUST be called with BH disabled due to the use
+ * of CPU specific variables.
*/
static struct sw_flow *flow_lookup(struct flow_table *tbl,
struct table_instance *ti,
@@ -732,7 +734,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
u32 *n_cache_hit,
u32 *index)
{
- u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
+ struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
struct sw_flow *flow;
struct sw_flow_mask *mask;
int i;
@@ -742,9 +744,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
if (mask) {
flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
if (flow) {
- u64_stats_update_begin(&ma->syncp);
- usage_counters[*index]++;
- u64_stats_update_end(&ma->syncp);
+ u64_stats_update_begin(&stats->syncp);
+ stats->usage_cntrs[*index]++;
+ u64_stats_update_end(&stats->syncp);
(*n_cache_hit)++;
return flow;
}
@@ -763,9 +765,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
if (flow) { /* Found */
*index = i;
- u64_stats_update_begin(&ma->syncp);
- usage_counters[*index]++;
- u64_stats_update_end(&ma->syncp);
+ u64_stats_update_begin(&stats->syncp);
+ stats->usage_cntrs[*index]++;
+ u64_stats_update_end(&stats->syncp);
return flow;
}
}
@@ -851,9 +853,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
u32 __always_unused n_mask_hit;
u32 __always_unused n_cache_hit;
+ struct sw_flow *flow;
u32 index = 0;
- return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+	/* This function gets called through the netlink interface and therefore
+ * is preemptible. However, flow_lookup() function needs to be called
+ * with BH disabled due to CPU specific variables.
+ */
+ local_bh_disable();
+ flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+ local_bh_enable();
+ return flow;
}
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
@@ -897,7 +907,8 @@ static bool ovs_flow_cmp_ufid(const struct sw_flow *flow,
return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len);
}
-bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match)
+bool ovs_flow_cmp(const struct sw_flow *flow,
+ const struct sw_flow_match *match)
{
if (ovs_identifier_is_ufid(&flow->id))
return flow_cmp_masked_key(flow, match->key, &match->range);
@@ -916,7 +927,7 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
hash = ufid_hash(ufid);
head = find_bucket(ti, hash);
hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
if (flow->ufid_table.hash == hash &&
ovs_flow_cmp_ufid(flow, ufid))
return flow;
@@ -950,7 +961,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
BUG_ON(table->count == 0);
- table_instance_flow_free(table, ti, ufid_ti, flow, true);
+ table_instance_flow_free(table, ti, ufid_ti, flow);
}
static struct sw_flow_mask *mask_alloc(void)
@@ -1107,9 +1118,8 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
if (!masks_and_count)
return;
- for (i = 0; i < ma->max; i++) {
+ for (i = 0; i < ma->max; i++) {
struct sw_flow_mask *mask;
- unsigned int start;
int cpu;
mask = rcu_dereference_ovsl(ma->masks[i]);
@@ -1120,14 +1130,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
masks_and_count[i].counter = 0;
for_each_possible_cpu(cpu) {
- u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
- cpu);
+ struct mask_array_stats *stats;
+ unsigned int start;
u64 counter;
+ stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&ma->syncp);
- counter = usage_counters[i];
- } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ counter = stats->usage_cntrs[i];
+ } while (u64_stats_fetch_retry_irq(&stats->syncp,
+ start));
masks_and_count[i].counter += counter;
}
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 6e7d4ac59353..9e659db78c05 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -38,12 +38,16 @@ struct mask_count {
u64 counter;
};
+struct mask_array_stats {
+ struct u64_stats_sync syncp;
+ u64 usage_cntrs[];
+};
+
struct mask_array {
struct rcu_head rcu;
int count, max;
- u64 __percpu *masks_usage_cntr;
+ struct mask_array_stats __percpu *masks_usage_stats;
u64 *masks_usage_zero_cntr;
- struct u64_stats_sync syncp;
struct sw_flow_mask __rcu *masks[];
};
@@ -53,7 +57,6 @@ struct table_instance {
struct rcu_head rcu;
int node_ver;
u32 hash_seed;
- bool keep_flows;
};
struct flow_table {
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
index 3d3d8e094546..15424d26e85d 100644
--- a/net/openvswitch/meter.c
+++ b/net/openvswitch/meter.c
@@ -423,7 +423,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
meter = dp_meter_create(a);
- if (IS_ERR_OR_NULL(meter))
+ if (IS_ERR(meter))
return PTR_ERR(meter);
reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET,
@@ -672,7 +672,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb,
return false;
}
-static struct genl_ops dp_meter_genl_ops[] = {
+static const struct genl_small_ops dp_meter_genl_ops[] = {
{ .cmd = OVS_METER_CMD_FEATURES,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = 0, /* OK for unprivileged users. */
@@ -711,8 +711,8 @@ struct genl_family dp_meter_genl_family __ro_after_init = {
.policy = meter_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_meter_genl_ops,
- .n_ops = ARRAY_SIZE(dp_meter_genl_ops),
+ .small_ops = dp_meter_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_meter_genl_ops),
.mcgrps = &ovs_meter_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 58a7b8312c28..5b2ee9c1c00b 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -35,21 +35,18 @@ internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
{
int len, err;
+ /* store len value because skb can be freed inside ovs_vport_receive() */
len = skb->len;
+
rcu_read_lock();
err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
rcu_read_unlock();
- if (likely(!err)) {
- struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- tstats->tx_bytes += len;
- tstats->tx_packets++;
- u64_stats_update_end(&tstats->syncp);
- } else {
+ if (likely(!err))
+ dev_sw_netstats_tx_add(netdev, 1, len);
+ else
netdev->stats.tx_errors++;
- }
+
return NETDEV_TX_OK;
}
@@ -83,42 +80,12 @@ static void internal_dev_destructor(struct net_device *dev)
ovs_vport_free(vport);
}
-static void
-internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
-{
- int i;
-
- memset(stats, 0, sizeof(*stats));
- stats->rx_errors = dev->stats.rx_errors;
- stats->tx_errors = dev->stats.tx_errors;
- stats->tx_dropped = dev->stats.tx_dropped;
- stats->rx_dropped = dev->stats.rx_dropped;
-
- for_each_possible_cpu(i) {
- const struct pcpu_sw_netstats *percpu_stats;
- struct pcpu_sw_netstats local_stats;
- unsigned int start;
-
- percpu_stats = per_cpu_ptr(dev->tstats, i);
-
- do {
- start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
- local_stats = *percpu_stats;
- } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
-
- stats->rx_bytes += local_stats.rx_bytes;
- stats->rx_packets += local_stats.rx_packets;
- stats->tx_bytes += local_stats.tx_bytes;
- stats->tx_packets += local_stats.tx_packets;
- }
-}
-
static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
.ndo_set_mac_address = eth_mac_addr,
- .ndo_get_stats64 = internal_get_stats,
+ .ndo_get_stats64 = dev_get_tstats64,
};
static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -225,7 +192,6 @@ static void internal_dev_destroy(struct vport *vport)
static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
{
struct net_device *netdev = skb->dev;
- struct pcpu_sw_netstats *stats;
if (unlikely(!(netdev->flags & IFF_UP))) {
kfree_skb(skb);
@@ -240,12 +206,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, netdev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
-
- stats = this_cpu_ptr(netdev->tstats);
- u64_stats_update_begin(&stats->syncp);
- stats->rx_packets++;
- stats->rx_bytes += skb->len;
- u64_stats_update_end(&stats->syncp);
+ dev_sw_netstats_rx_add(netdev, skb->len);
netif_rx(skb);
return NETDEV_TX_OK;
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 0d44c5c013fa..4ed7e52c7012 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -98,7 +98,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
struct vport *vport;
hlist_for_each_entry_rcu(vport, bucket, hash_node,
- lockdep_ovsl_is_held())
+ lockdep_ovsl_is_held())
if (!strcmp(name, ovs_vport_name(vport)) &&
net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
@@ -111,14 +111,16 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
*
* @priv_size: Size of private data area to allocate.
* @ops: vport device ops
+ * @parms: information about the new vport.
*
* Allocate and initialize a new vport defined by @ops. The vport will contain
* a private data area of size @priv_size that can be accessed using
- * vport_priv(). vports that are no longer needed should be released with
+ * vport_priv(). Some parameters of the vport will be initialized from @parms.
+ * @vports that are no longer needed should be released with
* vport_free().
*/
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
- const struct vport_parms *parms)
+ const struct vport_parms *parms)
{
struct vport *vport;
size_t alloc_size;
@@ -397,7 +399,8 @@ int ovs_vport_get_upcall_portids(const struct vport *vport,
*
* Returns the portid of the target socket. Must be called with rcu_read_lock.
*/
-u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
+u32 ovs_vport_find_upcall_portid(const struct vport *vport,
+ struct sk_buff *skb)
{
struct vport_portids *ids;
u32 ids_index;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 2b33e977a905..6bbc7a448593 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -46,6 +46,7 @@
* Copyright (C) 2011, <lokec@ccs.neu.edu>
*/
+#include <linux/ethtool.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
@@ -93,52 +94,56 @@
/*
Assumptions:
- - if device has no dev->hard_header routine, it adds and removes ll header
- inside itself. In this case ll header is invisible outside of device,
- but higher levels still should reserve dev->hard_header_len.
- Some devices are enough clever to reallocate skb, when header
- will not fit to reserved space (tunnel), another ones are silly
- (PPP).
+ - If the device has no dev->header_ops->create, there is no LL header
+ visible above the device. In this case, its hard_header_len should be 0.
+ The device may prepend its own header internally. In this case, its
+ needed_headroom should be set to the space needed for it to add its
+ internal header.
+ For example, a WiFi driver pretending to be an Ethernet driver should
+ set its hard_header_len to be the Ethernet header length, and set its
+ needed_headroom to be (the real WiFi header length - the fake Ethernet
+ header length).
- packet socket receives packets with pulled ll header,
so that SOCK_RAW should push it back.
On receive:
-----------
-Incoming, dev->hard_header!=NULL
+Incoming, dev_has_header(dev) == true
mac_header -> ll header
data -> data
-Outgoing, dev->hard_header!=NULL
+Outgoing, dev_has_header(dev) == true
mac_header -> ll header
data -> ll header
-Incoming, dev->hard_header==NULL
- mac_header -> UNKNOWN position. It is very likely, that it points to ll
- header. PPP makes it, that is wrong, because introduce
- assymetry between rx and tx paths.
+Incoming, dev_has_header(dev) == false
+ mac_header -> data
+ However drivers often make it point to the ll header.
+ This is incorrect because the ll header should be invisible to us.
data -> data
-Outgoing, dev->hard_header==NULL
- mac_header -> data. ll header is still not built!
+Outgoing, dev_has_header(dev) == false
+ mac_header -> data. ll header is invisible to us.
data -> data
Resume
- If dev->hard_header==NULL we are unlikely to restore sensible ll header.
+ If dev_has_header(dev) == false we are unable to restore the ll header,
+ because it is invisible to us.
On transmit:
------------
-dev->hard_header != NULL
+dev->header_ops != NULL
mac_header -> ll header
data -> ll header
-dev->hard_header == NULL (ll header is added by device, we cannot control it)
+dev->header_ops == NULL (ll header is invisible to us)
mac_header -> data
data -> data
- We should set nh.raw on output to correct posistion,
+ We should set network_header on output to the correct position,
packet classifier depends on it.
*/
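The rewritten comment above describes which sockets can see the link-layer header depending on whether the device provides header_ops. As a rough userspace illustration of the visible difference (not part of this patch): a SOCK_RAW packet socket receives frames including the link-layer header, while SOCK_DGRAM delivers only the payload and reports the link-layer address through struct sockaddr_ll. The interface "lo" and the blocking recv() calls below are illustrative assumptions; the program needs CAP_NET_RAW and some loopback traffic (e.g. a ping to 127.0.0.1) to print anything.

/* Hypothetical sketch: compare SOCK_RAW and SOCK_DGRAM packet sockets.
 * SOCK_RAW frames include the link-layer header, SOCK_DGRAM frames do not.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static int open_packet_socket(int type, const char *ifname)
{
	struct sockaddr_ll sll = { .sll_family = AF_PACKET,
				   .sll_protocol = htons(ETH_P_ALL),
				   .sll_ifindex = if_nametoindex(ifname) };
	int fd = socket(AF_PACKET, type, htons(ETH_P_ALL));

	if (fd < 0 || bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
		perror("packet socket");
		return -1;
	}
	return fd;
}

int main(void)
{
	char buf[2048];
	int raw = open_packet_socket(SOCK_RAW, "lo");
	int dgram = open_packet_socket(SOCK_DGRAM, "lo");
	ssize_t r, d;

	if (raw < 0 || dgram < 0)
		return 1;
	r = recv(raw, buf, sizeof(buf), 0);	/* blocks; includes LL header */
	d = recv(dgram, buf, sizeof(buf), 0);	/* blocks; payload only */
	printf("SOCK_RAW frame: %zd bytes, SOCK_DGRAM frame: %zd bytes\n", r, d);
	close(raw);
	close(dgram);
	return 0;
}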
@@ -177,7 +182,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
-#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -1633,13 +1637,15 @@ static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
return false;
}
-static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
+static int fanout_add(struct sock *sk, struct fanout_args *args)
{
struct packet_rollover *rollover = NULL;
struct packet_sock *po = pkt_sk(sk);
+ u16 type_flags = args->type_flags;
struct packet_fanout *f, *match;
u8 type = type_flags & 0xff;
u8 flags = type_flags >> 8;
+ u16 id = args->id;
int err;
switch (type) {
@@ -1697,11 +1703,21 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
}
}
err = -EINVAL;
- if (match && match->flags != flags)
- goto out;
- if (!match) {
+ if (match) {
+ if (match->flags != flags)
+ goto out;
+ if (args->max_num_members &&
+ args->max_num_members != match->max_num_members)
+ goto out;
+ } else {
+ if (args->max_num_members > PACKET_FANOUT_MAX)
+ goto out;
+ if (!args->max_num_members)
+ /* legacy PACKET_FANOUT_MAX */
+ args->max_num_members = 256;
err = -ENOMEM;
- match = kzalloc(sizeof(*match), GFP_KERNEL);
+ match = kvzalloc(struct_size(match, arr, args->max_num_members),
+ GFP_KERNEL);
if (!match)
goto out;
write_pnet(&match->net, sock_net(sk));
@@ -1717,6 +1733,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->prot_hook.func = packet_rcv_fanout;
match->prot_hook.af_packet_priv = match;
match->prot_hook.id_match = match_fanout_group;
+ match->max_num_members = args->max_num_members;
list_add(&match->list, &fanout_list);
}
err = -EINVAL;
@@ -1727,7 +1744,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->prot_hook.type == po->prot_hook.type &&
match->prot_hook.dev == po->prot_hook.dev) {
err = -ENOSPC;
- if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
+ if (refcount_read(&match->sk_ref) < match->max_num_members) {
__dev_remove_pack(&po->prot_hook);
po->fanout = match;
po->rollover = rollover;
@@ -1741,7 +1758,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
if (err && !refcount_read(&match->sk_ref)) {
list_del(&match->list);
- kfree(match);
+ kvfree(match);
}
out:
@@ -2066,7 +2083,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
- if (dev->header_ops) {
+ if (dev_has_header(dev)) {
/* The device has an explicit notion of ll header,
* exported to higher levels.
*
@@ -2195,7 +2212,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!net_eq(dev_net(dev), sock_net(sk)))
goto drop;
- if (dev->header_ops) {
+ if (dev_has_header(dev)) {
if (sk->sk_type != SOCK_DGRAM)
skb_push(skb, skb->data - skb_mac_header(skb));
else if (skb->pkt_type == PACKET_OUTGOING) {
@@ -3072,7 +3089,7 @@ static int packet_release(struct socket *sock)
kfree(po->rollover);
if (f) {
fanout_release_data(f);
- kfree(f);
+ kvfree(f);
}
/*
* Now the socket is dead. No more input will appear.
@@ -3863,14 +3880,14 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
}
case PACKET_FANOUT:
{
- int val;
+ struct fanout_args args = { 0 };
- if (optlen != sizeof(val))
+ if (optlen != sizeof(int) && optlen != sizeof(args))
return -EINVAL;
- if (copy_from_sockptr(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&args, optval, optlen))
return -EFAULT;
- return fanout_add(sk, val & 0xffff, val >> 16);
+ return fanout_add(sk, &args);
}
case PACKET_FANOUT_DATA:
{
@@ -4578,7 +4595,9 @@ static void packet_seq_stop(struct seq_file *seq, void *v)
static int packet_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
- seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
+ seq_printf(seq,
+ "%*sRefCnt Type Proto Iface R Rmem User Inode\n",
+ IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
else {
struct sock *s = sk_entry(v);
const struct packet_sock *po = pkt_sk(s);
@@ -4612,9 +4631,11 @@ static int __net_init packet_net_init(struct net *net)
mutex_init(&net->packet.sklist_lock);
INIT_HLIST_HEAD(&net->packet.sklist);
+#ifdef CONFIG_PROC_FS
if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
sizeof(struct seq_net_private)))
return -ENOMEM;
+#endif /* CONFIG_PROC_FS */
return 0;
}
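The fanout_add()/packet_setsockopt() changes above let PACKET_FANOUT accept either the legacy int encoding (id in the low 16 bits, type_flags in the high 16 bits) or an extended argument carrying max_num_members. A hedged userspace sketch follows; the local struct mirrors the fanout_args layout this patch adds to the UAPI header on a little-endian machine, and the group id 1 plus the 1024-member limit are arbitrary example values. On kernels without this change the larger optlen is rejected with -EINVAL.

/* Hypothetical sketch of the extended PACKET_FANOUT setsockopt. */
#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

struct fanout_args_compat {		/* assumed little-endian field order */
	uint16_t id;
	uint16_t type_flags;
	uint32_t max_num_members;
};

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	struct fanout_args_compat args = {
		.id = 1,			/* arbitrary fanout group id */
		.type_flags = PACKET_FANOUT_HASH,
		.max_num_members = 1024,	/* above the legacy limit of 256 */
	};

	if (fd < 0)
		return 1;
	if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &args, sizeof(args)) < 0)
		perror("PACKET_FANOUT (extended form)");
	else
		printf("joined fanout group %u (up to %u members)\n",
		       args.id, args.max_num_members);
	return 0;
}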
diff --git a/net/packet/internal.h b/net/packet/internal.h
index fd41ecb7f605..baafc3f3fa25 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -77,11 +77,12 @@ struct packet_ring_buffer {
};
extern struct mutex fanout_mutex;
-#define PACKET_FANOUT_MAX 256
+#define PACKET_FANOUT_MAX (1 << 16)
struct packet_fanout {
possible_net_t net;
unsigned int num_members;
+ u32 max_num_members;
u16 id;
u8 type;
u8 flags;
@@ -90,10 +91,10 @@ struct packet_fanout {
struct bpf_prog __rcu *bpf_prog;
};
struct list_head list;
- struct sock *arr[PACKET_FANOUT_MAX];
spinlock_t lock;
refcount_t sk_ref;
struct packet_type prot_hook ____cacheline_aligned_in_smp;
+ struct sock *arr[];
};
struct packet_rollover {
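struct packet_fanout now ends in a flexible array member whose size is chosen at allocation time, which is why fanout_add() above switched to kvzalloc(struct_size(match, arr, args->max_num_members), ...). A small userspace analogue of that allocation pattern is sketched below; struct_size() is the kernel's overflow-checked helper, so the hand-computed size here is purely illustrative, and member_table is a made-up name.

/* Userspace analogue of the flexible-array allocation used for
 * struct packet_fanout. The kernel's struct_size() additionally guards
 * the size arithmetic against overflow.
 */
#include <stdio.h>
#include <stdlib.h>

struct member_table {
	unsigned int num_members;
	unsigned int max_num_members;
	void *arr[];		/* flexible array member, sized at alloc time */
};

static struct member_table *member_table_alloc(unsigned int max)
{
	struct member_table *t;

	t = calloc(1, sizeof(*t) + (size_t)max * sizeof(t->arr[0]));
	if (t)
		t->max_num_members = max;
	return t;
}

int main(void)
{
	struct member_table *t = member_table_alloc(1024);

	if (!t)
		return 1;
	printf("table for up to %u members allocated\n", t->max_num_members);
	free(t);
	return 0;
}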
diff --git a/net/psample/psample.c b/net/psample/psample.c
index a042261a45c5..33e238c965bd 100644
--- a/net/psample/psample.c
+++ b/net/psample/psample.c
@@ -96,7 +96,7 @@ static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg,
return msg->len;
}
-static const struct genl_ops psample_nl_ops[] = {
+static const struct genl_small_ops psample_nl_ops[] = {
{
.cmd = PSAMPLE_CMD_GET_GROUP,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -112,8 +112,8 @@ static struct genl_family psample_nl_family __ro_after_init = {
.netnsok = true,
.module = THIS_MODULE,
.mcgrps = psample_nl_mcgrps,
- .ops = psample_nl_ops,
- .n_ops = ARRAY_SIZE(psample_nl_ops),
+ .small_ops = psample_nl_ops,
+ .n_small_ops = ARRAY_SIZE(psample_nl_ops),
.n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps),
};
diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c
index ff0c41467fc1..2bf2b1943e61 100644
--- a/net/qrtr/mhi.c
+++ b/net/qrtr/mhi.c
@@ -76,6 +76,11 @@ static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev,
struct qrtr_mhi_dev *qdev;
int rc;
+ /* start channels */
+ rc = mhi_prepare_for_transfer(mhi_dev);
+ if (rc)
+ return rc;
+
qdev = devm_kzalloc(&mhi_dev->dev, sizeof(*qdev), GFP_KERNEL);
if (!qdev)
return -ENOMEM;
@@ -99,6 +104,7 @@ static void qcom_mhi_qrtr_remove(struct mhi_device *mhi_dev)
struct qrtr_mhi_dev *qdev = dev_get_drvdata(&mhi_dev->dev);
qrtr_endpoint_unregister(&qdev->ep);
+ mhi_unprepare_from_transfer(mhi_dev);
dev_set_drvdata(&mhi_dev->dev, NULL);
}
diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c
index b8559c882431..8d00dfe8139e 100644
--- a/net/qrtr/ns.c
+++ b/net/qrtr/ns.c
@@ -517,10 +517,6 @@ static int ctrl_cmd_new_server(struct sockaddr_qrtr *from,
port = from->sq_port;
}
- /* Don't accept spoofed messages */
- if (from->sq_node != node_id)
- return -EINVAL;
-
srv = server_add(service, instance, node_id, port);
if (!srv)
return -EINVAL;
@@ -559,10 +555,6 @@ static int ctrl_cmd_del_server(struct sockaddr_qrtr *from,
port = from->sq_port;
}
- /* Don't accept spoofed messages */
- if (from->sq_node != node_id)
- return -EINVAL;
-
/* Local servers may only unregister themselves */
if (from->sq_node == qrtr_ns.local_node && from->sq_port != port)
return -EINVAL;
@@ -763,7 +755,7 @@ static void qrtr_ns_data_ready(struct sock *sk)
queue_work(qrtr_ns.workqueue, &qrtr_ns.work);
}
-void qrtr_ns_init(void)
+int qrtr_ns_init(void)
{
struct sockaddr_qrtr sq;
int ret;
@@ -774,7 +766,7 @@ void qrtr_ns_init(void)
ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM,
PF_QIPCRTR, &qrtr_ns.sock);
if (ret < 0)
- return;
+ return ret;
ret = kernel_getsockname(qrtr_ns.sock, (struct sockaddr *)&sq);
if (ret < 0) {
@@ -805,12 +797,13 @@ void qrtr_ns_init(void)
if (ret < 0)
goto err_wq;
- return;
+ return 0;
err_wq:
destroy_workqueue(qrtr_ns.workqueue);
err_sock:
sock_release(qrtr_ns.sock);
+ return ret;
}
EXPORT_SYMBOL_GPL(qrtr_ns_init);
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 957aa9263ba4..b34358282f37 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -171,8 +171,13 @@ static void __qrtr_node_release(struct kref *kref)
void __rcu **slot;
spin_lock_irqsave(&qrtr_nodes_lock, flags);
- if (node->nid != QRTR_EP_NID_AUTO)
- radix_tree_delete(&qrtr_nodes, node->nid);
+ /* If the node is a bridge for other nodes, there are possibly
+ * multiple entries pointing to our released node, delete them all.
+ */
+ radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) {
+ if (*slot == node)
+ radix_tree_iter_delete(&qrtr_nodes, &iter, slot);
+ }
spin_unlock_irqrestore(&qrtr_nodes_lock, flags);
list_del(&node->item);
@@ -347,7 +352,7 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb,
hdr->src_port_id = cpu_to_le32(from->sq_port);
if (to->sq_port == QRTR_PORT_CTRL) {
hdr->dst_node_id = cpu_to_le32(node->nid);
- hdr->dst_port_id = cpu_to_le32(QRTR_NODE_BCAST);
+ hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL);
} else {
hdr->dst_node_id = cpu_to_le32(to->sq_node);
hdr->dst_port_id = cpu_to_le32(to->sq_port);
@@ -401,12 +406,13 @@ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid)
{
unsigned long flags;
- if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO)
+ if (nid == QRTR_EP_NID_AUTO)
return;
spin_lock_irqsave(&qrtr_nodes_lock, flags);
radix_tree_insert(&qrtr_nodes, nid, node);
- node->nid = nid;
+ if (node->nid == QRTR_EP_NID_AUTO)
+ node->nid = nid;
spin_unlock_irqrestore(&qrtr_nodes_lock, flags);
}
@@ -494,6 +500,13 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len)
qrtr_node_assign(node, cb->src_node);
+ if (cb->type == QRTR_TYPE_NEW_SERVER) {
+ /* Remote node endpoint can bridge other distant nodes */
+ const struct qrtr_ctrl_pkt *pkt = data + hdrlen;
+
+ qrtr_node_assign(node, le32_to_cpu(pkt->server.node));
+ }
+
if (cb->type == QRTR_TYPE_RESUME_TX) {
qrtr_tx_resume(node, skb);
} else {
@@ -519,18 +532,20 @@ EXPORT_SYMBOL_GPL(qrtr_endpoint_post);
/**
* qrtr_alloc_ctrl_packet() - allocate control packet skb
* @pkt: reference to qrtr_ctrl_pkt pointer
+ * @flags: the type of memory to allocate
*
* Returns newly allocated sk_buff, or NULL on failure
*
* This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and
* on success returns a reference to the control packet in @pkt.
*/
-static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt)
+static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt,
+ gfp_t flags)
{
const int pkt_len = sizeof(struct qrtr_ctrl_pkt);
struct sk_buff *skb;
- skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, GFP_KERNEL);
+ skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, flags);
if (!skb)
return NULL;
@@ -592,6 +607,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep)
struct qrtr_ctrl_pkt *pkt;
struct qrtr_tx_flow *flow;
struct sk_buff *skb;
+ unsigned long flags;
void __rcu **slot;
mutex_lock(&node->ep_lock);
@@ -599,11 +615,18 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep)
mutex_unlock(&node->ep_lock);
/* Notify the local controller about the event */
- skb = qrtr_alloc_ctrl_packet(&pkt);
- if (skb) {
- pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE);
- qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst);
+ spin_lock_irqsave(&qrtr_nodes_lock, flags);
+ radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) {
+ if (*slot != node)
+ continue;
+ src.sq_node = iter.index;
+ skb = qrtr_alloc_ctrl_packet(&pkt, GFP_ATOMIC);
+ if (skb) {
+ pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE);
+ qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst);
+ }
}
+ spin_unlock_irqrestore(&qrtr_nodes_lock, flags);
/* Wake up any transmitters waiting for resume-tx from the node */
mutex_lock(&node->qrtr_tx_lock);
@@ -656,7 +679,7 @@ static void qrtr_port_remove(struct qrtr_sock *ipc)
to.sq_node = QRTR_NODE_BCAST;
to.sq_port = QRTR_PORT_CTRL;
- skb = qrtr_alloc_ctrl_packet(&pkt);
+ skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL);
if (skb) {
pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT);
pkt->client.node = cpu_to_le32(ipc->us.sq_node);
@@ -982,7 +1005,7 @@ static int qrtr_send_resume_tx(struct qrtr_cb *cb)
if (!node)
return -EINVAL;
- skb = qrtr_alloc_ctrl_packet(&pkt);
+ skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL);
if (!skb)
return -ENOMEM;
@@ -1264,13 +1287,19 @@ static int __init qrtr_proto_init(void)
return rc;
rc = sock_register(&qrtr_family);
- if (rc) {
- proto_unregister(&qrtr_proto);
- return rc;
- }
+ if (rc)
+ goto err_proto;
- qrtr_ns_init();
+ rc = qrtr_ns_init();
+ if (rc)
+ goto err_sock;
+ return 0;
+
+err_sock:
+ sock_unregister(qrtr_family.family);
+err_proto:
+ proto_unregister(&qrtr_proto);
return rc;
}
postcore_initcall(qrtr_proto_init);
diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h
index dc2b67f17927..3f2d28696062 100644
--- a/net/qrtr/qrtr.h
+++ b/net/qrtr/qrtr.h
@@ -29,7 +29,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep);
int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len);
-void qrtr_ns_init(void);
+int qrtr_ns_init(void);
void qrtr_ns_remove(void);
diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
index 15ce9b642b25..b238c40a9984 100644
--- a/net/qrtr/tun.c
+++ b/net/qrtr/tun.c
@@ -80,6 +80,12 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
void *kbuf;
+ if (!len)
+ return -EINVAL;
+
+ if (len > KMALLOC_MAX_SIZE)
+ return -ENOMEM;
+
kbuf = kzalloc(len, GFP_KERNEL);
if (!kbuf)
return -ENOMEM;
diff --git a/net/rds/cong.c b/net/rds/cong.c
index ccdff09a79c8..8b689ebbd5b5 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -236,7 +236,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
* tcp_setsockopt and/or tcp_sendmsg will deadlock
* when it tries to get the sock_lock())
* 2. Interrupts are masked so that we can mark the
- * the port congested from both send and recv paths.
+ * port congested from both send and recv paths.
* (See comment around declaration of rdc_cong_lock).
* An attempt to get the sock_lock() here will
* therefore trigger warnings.
diff --git a/net/rds/ib.c b/net/rds/ib.c
index deecbdcdae84..24c9a9005a6f 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*
*/
-#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/if.h>
@@ -108,7 +107,6 @@ static void rds_ib_dev_free(struct work_struct *work)
rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
if (rds_ibdev->pd)
ib_dealloc_pd(rds_ibdev->pd);
- dma_pool_destroy(rds_ibdev->rid_hdrs_pool);
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
list_del(&i_ipaddr->list);
@@ -191,14 +189,6 @@ static int rds_ib_add_one(struct ib_device *device)
rds_ibdev->pd = NULL;
goto put_dev;
}
- rds_ibdev->rid_hdrs_pool = dma_pool_create(device->name,
- device->dma_device,
- sizeof(struct rds_header),
- L1_CACHE_BYTES, 0);
- if (!rds_ibdev->rid_hdrs_pool) {
- ret = -ENOMEM;
- goto put_dev;
- }
rds_ibdev->mr_1m_pool =
rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 8dfff43cf07f..2ba71102b1f1 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -246,7 +246,6 @@ struct rds_ib_device {
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
- struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */
u8 odp_capable:1;
unsigned int max_mrs;
@@ -264,13 +263,6 @@ struct rds_ib_device {
int *vector_load;
};
-static inline int ibdev_to_node(struct ib_device *ibdev)
-{
- struct device *parent;
-
- parent = ibdev->dev.parent;
- return parent ? dev_to_node(parent) : NUMA_NO_NODE;
-}
#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
/* bits for i_ack_flags */
@@ -387,11 +379,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
void rds_ib_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
-struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
- struct dma_pool *pool,
- dma_addr_t **dma_addrs, u32 num_hdrs);
-void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
- dma_addr_t *dma_addrs, u32 num_hdrs);
#define rds_ib_conn_error(conn, fmt...) \
__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index c3319ff3ee11..f5cbe963cd8f 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*
*/
-#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/slab.h>
@@ -441,42 +440,87 @@ static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
rds_ibdev->vector_load[index]--;
}
+static void rds_dma_hdr_free(struct ib_device *dev, struct rds_header *hdr,
+ dma_addr_t dma_addr, enum dma_data_direction dir)
+{
+ ib_dma_unmap_single(dev, dma_addr, sizeof(*hdr), dir);
+ kfree(hdr);
+}
+
+static struct rds_header *rds_dma_hdr_alloc(struct ib_device *dev,
+ dma_addr_t *dma_addr, enum dma_data_direction dir)
+{
+ struct rds_header *hdr;
+
+ hdr = kzalloc_node(sizeof(*hdr), GFP_KERNEL, ibdev_to_node(dev));
+ if (!hdr)
+ return NULL;
+
+ *dma_addr = ib_dma_map_single(dev, hdr, sizeof(*hdr),
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(dev, *dma_addr)) {
+ kfree(hdr);
+ return NULL;
+ }
+
+ return hdr;
+}
+
+/* Free the DMA memory used to store struct rds_header.
+ *
+ * @dev: the RDS IB device
+ * @hdrs: pointer to the array storing DMA memory pointers
+ * @dma_addrs: pointer to the array storing DMA addresses
+ * @num_hdrs: number of headers to free.
+ */
+static void rds_dma_hdrs_free(struct rds_ib_device *dev,
+ struct rds_header **hdrs, dma_addr_t *dma_addrs, u32 num_hdrs,
+ enum dma_data_direction dir)
+{
+ u32 i;
+
+ for (i = 0; i < num_hdrs; i++)
+ rds_dma_hdr_free(dev->dev, hdrs[i], dma_addrs[i], dir);
+ kvfree(hdrs);
+ kvfree(dma_addrs);
+}
+
+
/* Allocate DMA coherent memory to be used to store struct rds_header for
* sending/receiving packets. The pointers to the DMA memory and the
* associated DMA addresses are stored in two arrays.
*
- * @ibdev: the IB device
- * @pool: the DMA memory pool
+ * @dev: the RDS IB device
* @dma_addrs: pointer to the array for storing DMA addresses
* @num_hdrs: number of headers to allocate
*
* It returns the pointer to the array storing the DMA memory pointers. On
* error, NULL pointer is returned.
*/
-struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
- struct dma_pool *pool,
- dma_addr_t **dma_addrs, u32 num_hdrs)
+static struct rds_header **rds_dma_hdrs_alloc(struct rds_ib_device *dev,
+ dma_addr_t **dma_addrs, u32 num_hdrs,
+ enum dma_data_direction dir)
{
struct rds_header **hdrs;
dma_addr_t *hdr_daddrs;
u32 i;
hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
- ibdev_to_node(ibdev));
+ ibdev_to_node(dev->dev));
if (!hdrs)
return NULL;
hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
- ibdev_to_node(ibdev));
+ ibdev_to_node(dev->dev));
if (!hdr_daddrs) {
kvfree(hdrs);
return NULL;
}
for (i = 0; i < num_hdrs; i++) {
- hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]);
+ hdrs[i] = rds_dma_hdr_alloc(dev->dev, &hdr_daddrs[i], dir);
if (!hdrs[i]) {
- rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i);
+ rds_dma_hdrs_free(dev, hdrs, hdr_daddrs, i, dir);
return NULL;
}
}
@@ -485,24 +529,6 @@ struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
return hdrs;
}
-/* Free the DMA memory used to store struct rds_header.
- *
- * @pool: the DMA memory pool
- * @hdrs: pointer to the array storing DMA memory pointers
- * @dma_addrs: pointer to the array storing DMA addresses
- * @num_hdars: number of headers to free.
- */
-void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
- dma_addr_t *dma_addrs, u32 num_hdrs)
-{
- u32 i;
-
- for (i = 0; i < num_hdrs; i++)
- dma_pool_free(pool, hdrs[i], dma_addrs[i]);
- kvfree(hdrs);
- kvfree(dma_addrs);
-}
-
/*
* This needs to be very careful to not leave IS_ERR pointers around for
* cleanup to trip over.
@@ -516,7 +542,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct rds_ib_device *rds_ibdev;
unsigned long max_wrs;
int ret, fr_queue_space;
- struct dma_pool *pool;
/*
* It's normal to see a null device if an incoming connection races
@@ -612,25 +637,26 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
goto recv_cq_out;
}
- pool = rds_ibdev->rid_hdrs_pool;
- ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma,
- ic->i_send_ring.w_nr);
+ ic->i_send_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("DMA send hdrs alloc failed\n");
goto qp_out;
}
- ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma,
- ic->i_recv_ring.w_nr);
+ ic->i_recv_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("DMA recv hdrs alloc failed\n");
goto send_hdrs_dma_out;
}
- ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL,
- &ic->i_ack_dma);
+ ic->i_ack = rds_dma_hdr_alloc(rds_ibdev->dev, &ic->i_ack_dma,
+ DMA_TO_DEVICE);
if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("DMA ack header alloc failed\n");
@@ -666,18 +692,19 @@ sends_out:
vfree(ic->i_sends);
ack_dma_out:
- dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
+ rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma,
+ DMA_TO_DEVICE);
ic->i_ack = NULL;
recv_hdrs_dma_out:
- rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
- ic->i_recv_ring.w_nr);
+ rds_dma_hdrs_free(rds_ibdev, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr, DMA_FROM_DEVICE);
ic->i_recv_hdrs = NULL;
ic->i_recv_hdrs_dma = NULL;
send_hdrs_dma_out:
- rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma,
- ic->i_send_ring.w_nr);
+ rds_dma_hdrs_free(rds_ibdev, ic->i_send_hdrs, ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr, DMA_TO_DEVICE);
ic->i_send_hdrs = NULL;
ic->i_send_hdrs_dma = NULL;
@@ -711,7 +738,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
- * from an older version. This could could be 3.0 or 2.0 - we can't tell.
+ * from an older version. This could be 3.0 or 2.0 - we can't tell.
* We really should have changed this for OFED 1.3 :-(
*/
@@ -956,9 +983,10 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
conn->c_proposed_version,
UINT_MAX, UINT_MAX, isv6);
- ret = rdma_connect(cm_id, &conn_param);
+ ret = rdma_connect_locked(cm_id, &conn_param);
if (ret)
- rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+ rds_ib_conn_error(conn, "rdma_connect_locked failed (%d)\n",
+ ret);
out:
/* Beware - returning non-zero tells the rdma_cm to destroy
@@ -1109,29 +1137,30 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
}
if (ic->rds_ibdev) {
- struct dma_pool *pool;
-
- pool = ic->rds_ibdev->rid_hdrs_pool;
-
/* then free the resources that ib callbacks use */
if (ic->i_send_hdrs) {
- rds_dma_hdrs_free(pool, ic->i_send_hdrs,
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_send_hdrs,
ic->i_send_hdrs_dma,
- ic->i_send_ring.w_nr);
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
ic->i_send_hdrs = NULL;
ic->i_send_hdrs_dma = NULL;
}
if (ic->i_recv_hdrs) {
- rds_dma_hdrs_free(pool, ic->i_recv_hdrs,
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_recv_hdrs,
ic->i_recv_hdrs_dma,
- ic->i_recv_ring.w_nr);
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
ic->i_recv_hdrs = NULL;
ic->i_recv_hdrs_dma = NULL;
}
if (ic->i_ack) {
- dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
+ rds_dma_hdr_free(ic->rds_ibdev->dev, ic->i_ack,
+ ic->i_ack_dma, DMA_TO_DEVICE);
ic->i_ack = NULL;
}
} else {
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 694d411dc72f..6fdedd9dbbc2 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -310,8 +310,8 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
struct rds_ib_connection *ic = conn->c_transport_data;
struct ib_sge *sge;
int ret = -ENOMEM;
- gfp_t slab_mask = GFP_NOWAIT;
- gfp_t page_mask = GFP_NOWAIT;
+ gfp_t slab_mask = gfp;
+ gfp_t page_mask = gfp;
if (gfp & __GFP_DIRECT_RECLAIM) {
slab_mask = GFP_KERNEL;
@@ -662,10 +662,16 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi
seq = rds_ib_get_ack(ic);
rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
rds_message_populate_header(hdr, 0, 0, 0);
hdr->h_ack = cpu_to_be64(seq);
hdr->h_credit = adv_credits;
rds_message_make_checksum(hdr);
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
+
ic->i_ack_queued = jiffies;
ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
@@ -845,6 +851,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_incoming *ibinc = ic->i_ibinc;
struct rds_header *ihdr, *hdr;
+ dma_addr_t dma_addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
/* XXX shut down the connection if port 0,0 are seen? */
@@ -863,6 +870,8 @@ static void rds_ib_process_recv(struct rds_connection *conn,
ihdr = ic->i_recv_hdrs[recv - ic->i_recvs];
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
rds_ib_conn_error(conn, "incoming message "
@@ -870,7 +879,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
"forcing a reconnect\n",
&conn->c_faddr);
rds_stats_inc(s_recv_drop_bad_checksum);
- return;
+ goto done;
}
/* Process the ACK sequence which comes with every packet */
@@ -899,7 +908,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
*/
rds_ib_frag_free(ic, recv->r_frag);
recv->r_frag = NULL;
- return;
+ goto done;
}
/*
@@ -933,7 +942,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
hdr->h_dport != ihdr->h_dport) {
rds_ib_conn_error(conn,
"fragment header mismatch; forcing reconnect\n");
- return;
+ goto done;
}
}
@@ -965,6 +974,9 @@ static void rds_ib_process_recv(struct rds_connection *conn,
rds_inc_put(&ibinc->ii_inc);
}
+done:
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
}
void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
@@ -1020,7 +1032,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
rds_ib_stats_inc(s_ib_rx_ring_empty);
if (rds_ib_ring_low(&ic->i_recv_ring)) {
- rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
+ rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN);
rds_ib_stats_inc(s_ib_rx_refill_from_cq);
}
}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index dfe778220657..92b4a8689aae 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -638,6 +638,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
send->s_sge[0].length = sizeof(struct rds_header);
send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
sizeof(struct rds_header));
@@ -688,6 +692,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
adv_credits = 0;
rds_ib_stats_inc(s_ib_tx_credit_updates);
}
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
if (prev)
prev->s_wr.next = &send->s_wr;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index ccdd304eae0a..6f1a50d50d06 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -269,7 +269,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out;
} else {
nents = ret;
- sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+ sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL);
if (!sg) {
ret = -ENOMEM;
goto out;
@@ -565,6 +565,9 @@ int rds_rdma_extra_size(struct rds_rdma_args *args,
if (args->nr_local == 0)
return -EINVAL;
+ if (args->nr_local > UIO_MAXIOV)
+ return -EMSGSIZE;
+
iov->iov = kcalloc(args->nr_local,
sizeof(struct rds_iovec),
GFP_KERNEL);
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 971c73c7d34c..68d6ef9e59fc 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -40,6 +40,7 @@ struct rfkill {
enum rfkill_type type;
unsigned long state;
+ unsigned long hard_block_reasons;
u32 idx;
@@ -265,6 +266,7 @@ static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
ev->hard = !!(rfkill->state & RFKILL_BLOCK_HW);
ev->soft = !!(rfkill->state & (RFKILL_BLOCK_SW |
RFKILL_BLOCK_SW_PREV));
+ ev->hard_block_reasons = rfkill->hard_block_reasons;
spin_unlock_irqrestore(&rfkill->lock, flags);
}
@@ -522,19 +524,29 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type)
}
#endif
-bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
+bool rfkill_set_hw_state_reason(struct rfkill *rfkill,
+ bool blocked, unsigned long reason)
{
unsigned long flags;
bool ret, prev;
BUG_ON(!rfkill);
+ if (WARN(reason &
+ ~(RFKILL_HARD_BLOCK_SIGNAL | RFKILL_HARD_BLOCK_NOT_OWNER),
+ "hw_state reason not supported: 0x%lx", reason))
+ return blocked;
+
spin_lock_irqsave(&rfkill->lock, flags);
- prev = !!(rfkill->state & RFKILL_BLOCK_HW);
- if (blocked)
+ prev = !!(rfkill->hard_block_reasons & reason);
+ if (blocked) {
rfkill->state |= RFKILL_BLOCK_HW;
- else
- rfkill->state &= ~RFKILL_BLOCK_HW;
+ rfkill->hard_block_reasons |= reason;
+ } else {
+ rfkill->hard_block_reasons &= ~reason;
+ if (!rfkill->hard_block_reasons)
+ rfkill->state &= ~RFKILL_BLOCK_HW;
+ }
ret = !!(rfkill->state & RFKILL_BLOCK_ANY);
spin_unlock_irqrestore(&rfkill->lock, flags);
@@ -546,7 +558,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
return ret;
}
-EXPORT_SYMBOL(rfkill_set_hw_state);
+EXPORT_SYMBOL(rfkill_set_hw_state_reason);
static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
{
@@ -744,6 +756,16 @@ static ssize_t soft_store(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(soft);
+static ssize_t hard_block_reasons_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ return sprintf(buf, "0x%lx\n", rfkill->hard_block_reasons);
+}
+static DEVICE_ATTR_RO(hard_block_reasons);
+
static u8 user_state_from_blocked(unsigned long state)
{
if (state & RFKILL_BLOCK_HW)
@@ -796,6 +818,7 @@ static struct attribute *rfkill_dev_attrs[] = {
&dev_attr_state.attr,
&dev_attr_soft.attr,
&dev_attr_hard.attr,
+ &dev_attr_hard_block_reasons.attr,
NULL,
};
ATTRIBUTE_GROUPS(rfkill_dev);
@@ -811,6 +834,7 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
{
struct rfkill *rfkill = to_rfkill(dev);
unsigned long flags;
+ unsigned long reasons;
u32 state;
int error;
@@ -823,10 +847,13 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
return error;
spin_lock_irqsave(&rfkill->lock, flags);
state = rfkill->state;
+ reasons = rfkill->hard_block_reasons;
spin_unlock_irqrestore(&rfkill->lock, flags);
error = add_uevent_var(env, "RFKILL_STATE=%d",
user_state_from_blocked(state));
- return error;
+ if (error)
+ return error;
+ return add_uevent_var(env, "RFKILL_HW_BLOCK_REASON=0x%lx", reasons);
}
void rfkill_pause_polling(struct rfkill *rfkill)
@@ -876,6 +903,9 @@ static int rfkill_resume(struct device *dev)
rfkill->suspended = false;
+ if (!rfkill->registered)
+ return 0;
+
if (!rfkill->persistent) {
cur = !!(rfkill->state & RFKILL_BLOCK_SW);
rfkill_set_block(rfkill, cur);
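The rfkill changes above add a per-device hard_block_reasons bitmask, exposed as a read-only sysfs attribute and as the RFKILL_HW_BLOCK_REASON uevent variable. A small userspace sketch that reads the attribute follows; the rfkill0 index in the path is just an example, and the file only exists on kernels that carry this change.

/* Hypothetical reader for the new hard_block_reasons attribute; the value is
 * a hex bitmask of RFKILL_HARD_BLOCK_* reasons.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/class/rfkill/rfkill0/hard_block_reasons";
	unsigned long reasons;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%lx", &reasons) != 1) {
		fprintf(stderr, "failed to parse %s\n", path);
		fclose(f);
		return 1;
	}
	printf("hard block reasons: 0x%lx\n", reasons);
	fclose(f);
	return 0;
}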
diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c
index 7b094275ea8b..11c45c8c6c16 100644
--- a/net/rose/rose_loopback.c
+++ b/net/rose/rose_loopback.c
@@ -96,10 +96,19 @@ static void rose_loopback_timer(struct timer_list *unused)
}
if (frametype == ROSE_CALL_REQUEST) {
- if ((dev = rose_dev_get(dest)) != NULL) {
- if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0)
- kfree_skb(skb);
- } else {
+ if (!rose_loopback_neigh->dev) {
+ kfree_skb(skb);
+ continue;
+ }
+
+ dev = rose_dev_get(dest);
+ if (!dev) {
+ kfree_skb(skb);
+ continue;
+ }
+
+ if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) {
+ dev_put(dev);
kfree_skb(skb);
}
} else {
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index ddd0f95713a9..b11281bed2a4 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -28,6 +28,7 @@ rxrpc-y := \
rtt.o \
security.o \
sendmsg.o \
+ server_key.o \
skbuff.o \
utils.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 186c8a889b16..41671af6b33f 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -308,9 +308,10 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
key = NULL; /* a no-security key */
memset(&p, 0, sizeof(p));
- p.user_call_ID = user_call_ID;
- p.tx_total_len = tx_total_len;
- p.interruptibility = interruptibility;
+ p.user_call_ID = user_call_ID;
+ p.tx_total_len = tx_total_len;
+ p.interruptibility = interruptibility;
+ p.kernel = true;
memset(&cp, 0, sizeof(cp));
cp.local = rx->local;
@@ -989,7 +990,7 @@ static int __init af_rxrpc_init(void)
goto error_security;
}
- ret = register_pernet_subsys(&rxrpc_net_ops);
+ ret = register_pernet_device(&rxrpc_net_ops);
if (ret)
goto error_pernet;
@@ -1034,7 +1035,7 @@ error_key_type:
error_sock:
proto_unregister(&rxrpc_proto);
error_proto:
- unregister_pernet_subsys(&rxrpc_net_ops);
+ unregister_pernet_device(&rxrpc_net_ops);
error_pernet:
rxrpc_exit_security();
error_security:
@@ -1056,7 +1057,7 @@ static void __exit af_rxrpc_exit(void)
unregister_key_type(&key_type_rxrpc);
sock_unregister(PF_RXRPC);
proto_unregister(&rxrpc_proto);
- unregister_pernet_subsys(&rxrpc_net_ops);
+ unregister_pernet_device(&rxrpc_net_ops);
ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0);
ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 97aebb5d19db..7bd6f8a66a3e 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -12,6 +12,7 @@
#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
#include "protocol.h"
#if 0
@@ -34,6 +35,7 @@ struct rxrpc_crypt {
#define rxrpc_queue_delayed_work(WS,D) \
queue_delayed_work(rxrpc_workqueue, (WS), (D))
+struct key_preparsed_payload;
struct rxrpc_connection;
/*
@@ -76,14 +78,12 @@ struct rxrpc_net {
struct work_struct service_conn_reaper;
struct timer_list service_conn_reap_timer;
- unsigned int nr_client_conns;
- unsigned int nr_active_client_conns;
- bool kill_all_client_conns;
bool live;
+
+ bool kill_all_client_conns;
+ atomic_t nr_client_conns;
spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */
spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */
- struct list_head waiting_client_conns;
- struct list_head active_client_conns;
struct list_head idle_client_conns;
struct work_struct client_conn_reaper;
struct timer_list client_conn_reap_timer;
@@ -218,17 +218,30 @@ struct rxrpc_security {
/* Clean up a security service */
void (*exit)(void);
+ /* Parse the information from a server key */
+ int (*preparse_server_key)(struct key_preparsed_payload *);
+
+ /* Clean up the preparse buffer after parsing a server key */
+ void (*free_preparse_server_key)(struct key_preparsed_payload *);
+
+ /* Destroy the payload of a server key */
+ void (*destroy_server_key)(struct key *);
+
+ /* Describe a server key */
+ void (*describe_server_key)(const struct key *, struct seq_file *);
+
/* initialise a connection's security */
- int (*init_connection_security)(struct rxrpc_connection *);
+ int (*init_connection_security)(struct rxrpc_connection *,
+ struct rxrpc_key_token *);
- /* prime a connection's packet security */
- int (*prime_packet_security)(struct rxrpc_connection *);
+ /* Work out how much data we can store in a packet, given an estimate
+ * of the amount of data remaining.
+ */
+ int (*how_much_data)(struct rxrpc_call *, size_t,
+ size_t *, size_t *, size_t *);
/* impose security on a packet */
- int (*secure_packet)(struct rxrpc_call *,
- struct sk_buff *,
- size_t,
- void *);
+ int (*secure_packet)(struct rxrpc_call *, struct sk_buff *, size_t);
/* verify the security on a received packet */
int (*verify_packet)(struct rxrpc_call *, struct sk_buff *,
@@ -275,8 +288,8 @@ struct rxrpc_local {
struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
struct sk_buff_head reject_queue; /* packets awaiting rejection */
struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
- struct rb_root client_conns; /* Client connections by socket params */
- spinlock_t client_conns_lock; /* Lock for client_conns */
+ struct rb_root client_bundles; /* Client connection bundles by socket params */
+ spinlock_t client_bundles_lock; /* Lock for client_bundles */
spinlock_t lock; /* access lock */
rwlock_t services_lock; /* lock for services list */
int debug_id; /* debug ID for printks */
@@ -353,10 +366,7 @@ struct rxrpc_conn_parameters {
enum rxrpc_conn_flag {
RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */
RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */
- RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */
- RXRPC_CONN_EXPOSED, /* Conn has extra ref for exposure */
RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */
- RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */
RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */
RXRPC_CONN_FINAL_ACK_0, /* Need final ACK for channel 0 */
RXRPC_CONN_FINAL_ACK_1, /* Need final ACK for channel 1 */
@@ -377,19 +387,6 @@ enum rxrpc_conn_event {
};
/*
- * The connection cache state.
- */
-enum rxrpc_conn_cache_state {
- RXRPC_CONN_CLIENT_INACTIVE, /* Conn is not yet listed */
- RXRPC_CONN_CLIENT_WAITING, /* Conn is on wait list, waiting for capacity */
- RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */
- RXRPC_CONN_CLIENT_UPGRADE, /* Conn is on active list, probing for upgrade */
- RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */
- RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */
- RXRPC_CONN__NR_CACHE_STATES
-};
-
-/*
* The connection protocol state.
*/
enum rxrpc_conn_proto_state {
@@ -405,6 +402,23 @@ enum rxrpc_conn_proto_state {
};
/*
+ * RxRPC client connection bundle.
+ */
+struct rxrpc_bundle {
+ struct rxrpc_conn_parameters params;
+ atomic_t usage;
+ unsigned int debug_id;
+ bool try_upgrade; /* True if the bundle is attempting upgrade */
+ bool alloc_conn; /* True if someone's getting a conn */
+ short alloc_error; /* Error from last conn allocation */
+ spinlock_t channel_lock;
+ struct rb_node local_node; /* Node in local->client_conns */
+ struct list_head waiting_calls; /* Calls waiting for channels */
+ unsigned long avail_chans; /* Mask of available channels */
+ struct rxrpc_connection *conns[4]; /* The connections in the bundle (max 4) */
+};
+
+/*
* RxRPC connection definition
* - matched by { local, peer, epoch, conn_id, direction }
* - each connection can only handle four simultaneous calls
@@ -417,10 +431,7 @@ struct rxrpc_connection {
struct rcu_head rcu;
struct list_head cache_link;
- spinlock_t channel_lock;
- unsigned char active_chans; /* Mask of active channels */
-#define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1)
- struct list_head waiting_calls; /* Calls waiting for channels */
+ unsigned char act_chans; /* Mask of active channels */
struct rxrpc_channel {
unsigned long final_ack_at; /* Time at which to issue final ACK */
struct rxrpc_call __rcu *call; /* Active call */
@@ -437,33 +448,33 @@ struct rxrpc_connection {
struct timer_list timer; /* Conn event timer */
struct work_struct processor; /* connection event processor */
- union {
- struct rb_node client_node; /* Node in local->client_conns */
- struct rb_node service_node; /* Node in peer->service_conns */
- };
+ struct rxrpc_bundle *bundle; /* Client connection bundle */
+ struct rb_node service_node; /* Node in peer->service_conns */
struct list_head proc_link; /* link in procfs list */
struct list_head link; /* link in master connection list */
struct sk_buff_head rx_queue; /* received conn-level packets */
+
const struct rxrpc_security *security; /* applied security module */
- struct key *server_key; /* security for this service */
- struct crypto_sync_skcipher *cipher; /* encryption handle */
- struct rxrpc_crypt csum_iv; /* packet checksum base */
+ union {
+ struct {
+ struct crypto_sync_skcipher *cipher; /* encryption handle */
+ struct rxrpc_crypt csum_iv; /* packet checksum base */
+ u32 nonce; /* response re-use preventer */
+ } rxkad;
+ };
unsigned long flags;
unsigned long events;
unsigned long idle_timestamp; /* Time at which last became idle */
spinlock_t state_lock; /* state-change lock */
- enum rxrpc_conn_cache_state cache_state;
enum rxrpc_conn_proto_state state; /* current state of connection */
u32 abort_code; /* Abort code of connection abort */
int debug_id; /* debug ID for printks */
atomic_t serial; /* packet serial number counter */
unsigned int hi_serial; /* highest serial number received */
- u32 security_nonce; /* response re-use preventer */
u32 service_id; /* Service ID, possibly upgraded */
- u8 size_align; /* data size alignment (for security) */
- u8 security_size; /* security header size */
u8 security_ix; /* security type */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
+ u8 bundle_shift; /* Index into bundle->avail_chans */
short error; /* Local error code */
};
@@ -493,6 +504,8 @@ enum rxrpc_call_flag {
RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */
RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */
+ RXRPC_CALL_KERNEL, /* The call was made by the kernel */
+ RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */
};
/*
@@ -576,7 +589,7 @@ struct rxrpc_call {
struct work_struct processor; /* Event processor */
rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */
struct list_head link; /* link in master call list */
- struct list_head chan_wait_link; /* Link in conn->waiting_calls */
+ struct list_head chan_wait_link; /* Link in conn->bundle->waiting_calls */
struct hlist_node error_link; /* link in error distribution list */
struct list_head accept_link; /* Link in rx->acceptq */
struct list_head recvmsg_link; /* Link in rx->recvmsg_q */
@@ -726,6 +739,7 @@ struct rxrpc_call_params {
u32 normal; /* Max time since last call packet (msec) */
} timeouts;
u8 nr_timeouts; /* Number of timeouts specified */
+ bool kernel; /* T if kernel is making the call */
enum rxrpc_interruptibility interruptibility; /* How interruptible is the call? */
};
@@ -812,18 +826,19 @@ static inline bool rxrpc_is_client_call(const struct rxrpc_call *call)
/*
* conn_client.c
*/
-extern unsigned int rxrpc_max_client_connections;
extern unsigned int rxrpc_reap_client_connections;
extern unsigned long rxrpc_conn_idle_client_expiry;
extern unsigned long rxrpc_conn_idle_client_fast_expiry;
extern struct idr rxrpc_client_conn_ids;
void rxrpc_destroy_client_conn_ids(void);
+struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *);
+void rxrpc_put_bundle(struct rxrpc_bundle *);
int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *,
struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *,
gfp_t);
void rxrpc_expose_client_call(struct rxrpc_call *);
-void rxrpc_disconnect_client_call(struct rxrpc_call *);
+void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *);
void rxrpc_put_client_conn(struct rxrpc_connection *);
void rxrpc_discard_expired_client_conns(struct work_struct *);
void rxrpc_destroy_all_client_connections(struct rxrpc_net *);
@@ -833,6 +848,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *);
* conn_event.c
*/
void rxrpc_process_connection(struct work_struct *);
+void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool);
/*
* conn_object.c
@@ -849,7 +865,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *);
void rxrpc_kill_connection(struct rxrpc_connection *);
bool rxrpc_queue_conn(struct rxrpc_connection *);
void rxrpc_see_connection(struct rxrpc_connection *);
-void rxrpc_get_connection(struct rxrpc_connection *);
+struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *);
struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *);
void rxrpc_put_service_conn(struct rxrpc_connection *);
void rxrpc_service_connection_reaper(struct work_struct *);
@@ -889,8 +905,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *,
struct sk_buff *);
struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t);
void rxrpc_new_incoming_connection(struct rxrpc_sock *, struct rxrpc_connection *,
- const struct rxrpc_security *, struct key *,
- struct sk_buff *);
+ const struct rxrpc_security *, struct sk_buff *);
void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
/*
@@ -907,10 +922,8 @@ extern const struct rxrpc_security rxrpc_no_security;
* key.c
*/
extern struct key_type key_type_rxrpc;
-extern struct key_type key_type_rxrpc_s;
int rxrpc_request_key(struct rxrpc_sock *, sockptr_t, int);
-int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int);
int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t,
u32);
@@ -1053,11 +1066,13 @@ extern const struct rxrpc_security rxkad;
* security.c
*/
int __init rxrpc_init_security(void);
+const struct rxrpc_security *rxrpc_security_lookup(u8);
void rxrpc_exit_security(void);
int rxrpc_init_client_conn_security(struct rxrpc_connection *);
-bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *,
- const struct rxrpc_security **, struct key **,
- struct sk_buff *);
+const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *,
+ struct sk_buff *);
+struct key *rxrpc_look_up_server_security(struct rxrpc_connection *,
+ struct sk_buff *, u32, u32);
/*
* sendmsg.c
@@ -1065,6 +1080,13 @@ bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *,
int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
/*
+ * server_key.c
+ */
+extern struct key_type key_type_rxrpc_s;
+
+int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int);
+
+/*
* skbuff.c
*/
void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 8df1964db333..1ae90fb97936 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -197,6 +197,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
tail = b->peer_backlog_tail;
while (CIRC_CNT(head, tail, size) > 0) {
struct rxrpc_peer *peer = b->peer_backlog[tail];
+ rxrpc_put_local(peer->local);
kfree(peer);
tail = (tail + 1) & (size - 1);
}
@@ -261,7 +262,6 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
struct rxrpc_peer *peer,
struct rxrpc_connection *conn,
const struct rxrpc_security *sec,
- struct key *key,
struct sk_buff *skb)
{
struct rxrpc_backlog *b = rx->backlog;
@@ -309,7 +309,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
conn->params.local = rxrpc_get_local(local);
conn->params.peer = peer;
rxrpc_see_connection(conn);
- rxrpc_new_incoming_connection(rx, conn, sec, key, skb);
+ rxrpc_new_incoming_connection(rx, conn, sec, skb);
} else {
rxrpc_get_connection(conn);
}
@@ -353,7 +353,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
struct rxrpc_connection *conn;
struct rxrpc_peer *peer = NULL;
struct rxrpc_call *call = NULL;
- struct key *key = NULL;
_enter("");
@@ -374,11 +373,13 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
*/
conn = rxrpc_find_connection_rcu(local, skb, &peer);
- if (!conn && !rxrpc_look_up_server_security(local, rx, &sec, &key, skb))
- goto no_call;
+ if (!conn) {
+ sec = rxrpc_get_incoming_security(rx, skb);
+ if (!sec)
+ goto no_call;
+ }
- call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, key, skb);
- key_put(key);
+ call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, skb);
if (!call) {
skb->mark = RXRPC_SKB_MARK_REJECT_BUSY;
goto no_call;
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index ed49769b459d..4eb91d958a48 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -40,6 +40,11 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
struct kmem_cache *rxrpc_call_jar;
+static struct semaphore rxrpc_call_limiter =
+ __SEMAPHORE_INITIALIZER(rxrpc_call_limiter, 1000);
+static struct semaphore rxrpc_kernel_call_limiter =
+ __SEMAPHORE_INITIALIZER(rxrpc_kernel_call_limiter, 1000);
+
static void rxrpc_call_timer_expired(struct timer_list *t)
{
struct rxrpc_call *call = from_timer(call, t, timer);
@@ -209,6 +214,34 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
}
/*
+ * Wait for a call slot to become available.
+ */
+static struct semaphore *rxrpc_get_call_slot(struct rxrpc_call_params *p, gfp_t gfp)
+{
+ struct semaphore *limiter = &rxrpc_call_limiter;
+
+ if (p->kernel)
+ limiter = &rxrpc_kernel_call_limiter;
+ if (p->interruptibility == RXRPC_UNINTERRUPTIBLE) {
+ down(limiter);
+ return limiter;
+ }
+ return down_interruptible(limiter) < 0 ? NULL : limiter;
+}
+
+/*
+ * Release a call slot.
+ */
+static void rxrpc_put_call_slot(struct rxrpc_call *call)
+{
+ struct semaphore *limiter = &rxrpc_call_limiter;
+
+ if (test_bit(RXRPC_CALL_KERNEL, &call->flags))
+ limiter = &rxrpc_kernel_call_limiter;
+ up(limiter);
+}
+
+/*
* Set up a call for the given parameters.
* - Called with the socket lock held, which it must release.
* - If it returns a call, the call's lock will need releasing by the caller.
@@ -224,15 +257,21 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
{
struct rxrpc_call *call, *xcall;
struct rxrpc_net *rxnet;
+ struct semaphore *limiter;
struct rb_node *parent, **pp;
const void *here = __builtin_return_address(0);
int ret;
_enter("%p,%lx", rx, p->user_call_ID);
+ limiter = rxrpc_get_call_slot(p, gfp);
+ if (!limiter)
+ return ERR_PTR(-ERESTARTSYS);
+
call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id);
if (IS_ERR(call)) {
release_sock(&rx->sk);
+ up(limiter);
_leave(" = %ld", PTR_ERR(call));
return call;
}
@@ -242,6 +281,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
trace_rxrpc_call(call->debug_id, rxrpc_call_new_client,
atomic_read(&call->usage),
here, (const void *)p->user_call_ID);
+ if (p->kernel)
+ __set_bit(RXRPC_CALL_KERNEL, &call->flags);
/* We need to protect a partially set up call against the user as we
* will be acting outside the socket lock.
@@ -468,6 +509,8 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
BUG();
spin_unlock_bh(&call->lock);
+ rxrpc_put_call_slot(call);
+
del_timer_sync(&call->timer);
/* Make sure we don't get any more notifications */
@@ -505,8 +548,6 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
rxrpc_disconnect_call(call);
if (call->security)
call->security->free_call_crypto(call);
-
- rxrpc_cleanup_ring(call);
_leave("");
}
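
The call_object.c hunks above swap the old global connection cap for two counting semaphores that bound how many client calls may be set up at once, one for kernel-initiated calls and one for userspace calls; interruptible callers use down_interruptible() and back out with -ERESTARTSYS. The following userspace sketch is illustrative only and not part of the patch: POSIX semaphores stand in for the kernel's struct semaphore, and all names are made up.

#include <errno.h>
#include <semaphore.h>
#include <stdbool.h>

#define MAX_CALLS 1000

static sem_t call_limiter;          /* stands in for rxrpc_call_limiter */
static sem_t kernel_call_limiter;   /* stands in for rxrpc_kernel_call_limiter */

/* Acquire a call slot; returns the semaphore taken, or NULL if interrupted. */
static sem_t *get_call_slot(bool kernel, bool interruptible)
{
	sem_t *limiter = kernel ? &kernel_call_limiter : &call_limiter;

	if (!interruptible) {
		while (sem_wait(limiter) == -1 && errno == EINTR)
			;		/* uninterruptible: keep waiting across signals */
		return limiter;
	}
	if (sem_wait(limiter) == -1)
		return NULL;		/* interrupted: caller maps this to an error */
	return limiter;
}

/* Release the slot so another call may start. */
static void put_call_slot(sem_t *limiter)
{
	sem_post(limiter);
}

int main(void)
{
	sem_init(&call_limiter, 0, MAX_CALLS);
	sem_init(&kernel_call_limiter, 0, MAX_CALLS);

	sem_t *slot = get_call_slot(false, true);
	if (slot) {
		/* ... allocate and run the call here ... */
		put_call_slot(slot);
	}
	return 0;
}

The slot is taken before the call is allocated and released in the release path, so a failed allocation must also post the semaphore, mirroring the up(limiter) on the error path above.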
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 159e3eda7914..dbea0bfee48e 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -1,63 +1,15 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Client connection-specific management code.
*
- * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2016, 2020 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* Client connections need to be cached for a little while after they've made a
* call so as to handle retransmitted DATA packets in case the server didn't
* receive the final ACK or terminating ABORT we sent it.
*
- * Client connections can be in one of a number of cache states:
- *
- * (1) INACTIVE - The connection is not held in any list and may not have been
- * exposed to the world. If it has been previously exposed, it was
- * discarded from the idle list after expiring.
- *
- * (2) WAITING - The connection is waiting for the number of client conns to
- * drop below the maximum capacity. Calls may be in progress upon it from
- * when it was active and got culled.
- *
- * The connection is on the rxrpc_waiting_client_conns list which is kept
- * in to-be-granted order. Culled conns with waiters go to the back of
- * the queue just like new conns.
- *
- * (3) ACTIVE - The connection has at least one call in progress upon it, it
- * may freely grant available channels to new calls and calls may be
- * waiting on it for channels to become available.
- *
- * The connection is on the rxnet->active_client_conns list which is kept
- * in activation order for culling purposes.
- *
- * rxrpc_nr_active_client_conns is held incremented also.
- *
- * (4) UPGRADE - As for ACTIVE, but only one call may be in progress and is
- * being used to probe for service upgrade.
- *
- * (5) CULLED - The connection got summarily culled to try and free up
- * capacity. Calls currently in progress on the connection are allowed to
- * continue, but new calls will have to wait. There can be no waiters in
- * this state - the conn would have to go to the WAITING state instead.
- *
- * (6) IDLE - The connection has no calls in progress upon it and must have
- * been exposed to the world (ie. the EXPOSED flag must be set). When it
- * expires, the EXPOSED flag is cleared and the connection transitions to
- * the INACTIVE state.
- *
- * The connection is on the rxnet->idle_client_conns list which is kept in
- * order of how soon they'll expire.
- *
* There are flags of relevance to the cache:
*
- * (1) EXPOSED - The connection ID got exposed to the world. If this flag is
- * set, an extra ref is added to the connection preventing it from being
- * reaped when it has no calls outstanding. This flag is cleared and the
- * ref dropped when a conn is discarded from the idle list.
- *
- * This allows us to move terminal call state retransmission to the
- * connection and to discard the call immediately we think it is done
- * with. It also give us a chance to reuse the connection.
- *
* (2) DONT_REUSE - The connection should be discarded as soon as possible and
* should not be reused. This is set when an exclusive connection is used
* or a call ID counter overflows.
@@ -78,7 +30,6 @@
#include "ar-internal.h"
-__read_mostly unsigned int rxrpc_max_client_connections = 1000;
__read_mostly unsigned int rxrpc_reap_client_connections = 900;
__read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
__read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
@@ -89,8 +40,6 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
DEFINE_IDR(rxrpc_client_conn_ids);
static DEFINE_SPINLOCK(rxrpc_conn_id_lock);
-static void rxrpc_cull_active_client_conns(struct rxrpc_net *);
-
/*
* Get a connection ID and epoch for a client connection from the global pool.
* The connection struct pointer is then recorded in the idr radix tree. The
@@ -162,13 +111,50 @@ void rxrpc_destroy_client_conn_ids(void)
}
/*
+ * Allocate a connection bundle.
+ */
+static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp,
+ gfp_t gfp)
+{
+ struct rxrpc_bundle *bundle;
+
+ bundle = kzalloc(sizeof(*bundle), gfp);
+ if (bundle) {
+ bundle->params = *cp;
+ rxrpc_get_peer(bundle->params.peer);
+ atomic_set(&bundle->usage, 1);
+ spin_lock_init(&bundle->channel_lock);
+ INIT_LIST_HEAD(&bundle->waiting_calls);
+ }
+ return bundle;
+}
+
+struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle)
+{
+ atomic_inc(&bundle->usage);
+ return bundle;
+}
+
+void rxrpc_put_bundle(struct rxrpc_bundle *bundle)
+{
+ unsigned int d = bundle->debug_id;
+ unsigned int u = atomic_dec_return(&bundle->usage);
+
+ _debug("PUT B=%x %u", d, u);
+ if (u == 0) {
+ rxrpc_put_peer(bundle->params.peer);
+ kfree(bundle);
+ }
+}
+
+/*
* Allocate a client connection.
*/
static struct rxrpc_connection *
-rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
+rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp)
{
struct rxrpc_connection *conn;
- struct rxrpc_net *rxnet = cp->local->rxnet;
+ struct rxrpc_net *rxnet = bundle->params.local->rxnet;
int ret;
_enter("");
@@ -180,15 +166,11 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
}
atomic_set(&conn->usage, 1);
- if (cp->exclusive)
- __set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
- if (cp->upgrade)
- __set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags);
-
- conn->params = *cp;
+ conn->bundle = bundle;
+ conn->params = bundle->params;
conn->out_clientflag = RXRPC_CLIENT_INITIATED;
conn->state = RXRPC_CONN_CLIENT;
- conn->service_id = cp->service_id;
+ conn->service_id = conn->params.service_id;
ret = rxrpc_get_client_connection_id(conn, gfp);
if (ret < 0)
@@ -198,29 +180,25 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
if (ret < 0)
goto error_1;
- ret = conn->security->prime_packet_security(conn);
- if (ret < 0)
- goto error_2;
-
atomic_inc(&rxnet->nr_conns);
write_lock(&rxnet->conn_lock);
list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
write_unlock(&rxnet->conn_lock);
- /* We steal the caller's peer ref. */
- cp->peer = NULL;
+ rxrpc_get_bundle(bundle);
+ rxrpc_get_peer(conn->params.peer);
rxrpc_get_local(conn->params.local);
key_get(conn->params.key);
trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client,
atomic_read(&conn->usage),
__builtin_return_address(0));
+
+ atomic_inc(&rxnet->nr_client_conns);
trace_rxrpc_client(conn, -1, rxrpc_client_alloc);
_leave(" = %p", conn);
return conn;
-error_2:
- conn->security->clear(conn);
error_1:
rxrpc_put_client_connection_id(conn);
error_0:
@@ -234,13 +212,18 @@ error_0:
*/
static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
{
- struct rxrpc_net *rxnet = conn->params.local->rxnet;
+ struct rxrpc_net *rxnet;
int id_cursor, id, distance, limit;
+ if (!conn)
+ goto dont_reuse;
+
+ rxnet = conn->params.local->rxnet;
if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags))
goto dont_reuse;
- if (conn->proto.epoch != rxnet->epoch)
+ if (conn->state != RXRPC_CONN_CLIENT ||
+ conn->proto.epoch != rxnet->epoch)
goto mark_dont_reuse;
/* The IDR tree gets very expensive on memory if the connection IDs are
@@ -254,7 +237,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
distance = id - id_cursor;
if (distance < 0)
distance = -distance;
- limit = max(rxrpc_max_client_connections * 4, 1024U);
+ limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024);
if (distance > limit)
goto mark_dont_reuse;
@@ -267,277 +250,247 @@ dont_reuse:
}
/*
- * Create or find a client connection to use for a call.
- *
- * If we return with a connection, the call will be on its waiting list. It's
- * left to the caller to assign a channel and wake up the call.
+ * Look up the conn bundle that matches the connection parameters, adding it if
+ * it doesn't yet exist.
*/
-static int rxrpc_get_client_conn(struct rxrpc_sock *rx,
- struct rxrpc_call *call,
- struct rxrpc_conn_parameters *cp,
- struct sockaddr_rxrpc *srx,
- gfp_t gfp)
+static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *cp,
+ gfp_t gfp)
{
- struct rxrpc_connection *conn, *candidate = NULL;
+ static atomic_t rxrpc_bundle_id;
+ struct rxrpc_bundle *bundle, *candidate;
struct rxrpc_local *local = cp->local;
struct rb_node *p, **pp, *parent;
long diff;
- int ret = -ENOMEM;
- _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
+ _enter("{%px,%x,%u,%u}",
+ cp->peer, key_serial(cp->key), cp->security_level, cp->upgrade);
- cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp);
- if (!cp->peer)
- goto error;
+ if (cp->exclusive)
+ return rxrpc_alloc_bundle(cp, gfp);
- call->cong_cwnd = cp->peer->cong_cwnd;
- if (call->cong_cwnd >= call->cong_ssthresh)
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
- else
- call->cong_mode = RXRPC_CALL_SLOW_START;
+ /* First, see if the bundle is already there. */
+ _debug("search 1");
+ spin_lock(&local->client_bundles_lock);
+ p = local->client_bundles.rb_node;
+ while (p) {
+ bundle = rb_entry(p, struct rxrpc_bundle, local_node);
- /* If the connection is not meant to be exclusive, search the available
- * connections to see if the connection we want to use already exists.
- */
- if (!cp->exclusive) {
- _debug("search 1");
- spin_lock(&local->client_conns_lock);
- p = local->client_conns.rb_node;
- while (p) {
- conn = rb_entry(p, struct rxrpc_connection, client_node);
-
-#define cmp(X) ((long)conn->params.X - (long)cp->X)
- diff = (cmp(peer) ?:
- cmp(key) ?:
- cmp(security_level) ?:
- cmp(upgrade));
+#define cmp(X) ((long)bundle->params.X - (long)cp->X)
+ diff = (cmp(peer) ?:
+ cmp(key) ?:
+ cmp(security_level) ?:
+ cmp(upgrade));
#undef cmp
- if (diff < 0) {
- p = p->rb_left;
- } else if (diff > 0) {
- p = p->rb_right;
- } else {
- if (rxrpc_may_reuse_conn(conn) &&
- rxrpc_get_connection_maybe(conn))
- goto found_extant_conn;
- /* The connection needs replacing. It's better
- * to effect that when we have something to
- * replace it with so that we don't have to
- * rebalance the tree twice.
- */
- break;
- }
- }
- spin_unlock(&local->client_conns_lock);
+ if (diff < 0)
+ p = p->rb_left;
+ else if (diff > 0)
+ p = p->rb_right;
+ else
+ goto found_bundle;
}
+ spin_unlock(&local->client_bundles_lock);
+ _debug("not found");
- /* There wasn't a connection yet or we need an exclusive connection.
- * We need to create a candidate and then potentially redo the search
- * in case we're racing with another thread also trying to connect on a
- * shareable connection.
- */
- _debug("new conn");
- candidate = rxrpc_alloc_client_connection(cp, gfp);
- if (IS_ERR(candidate)) {
- ret = PTR_ERR(candidate);
- goto error_peer;
- }
-
- /* Add the call to the new connection's waiting list in case we're
- * going to have to wait for the connection to come live. It's our
- * connection, so we want first dibs on the channel slots. We would
- * normally have to take channel_lock but we do this before anyone else
- * can see the connection.
- */
- list_add(&call->chan_wait_link, &candidate->waiting_calls);
-
- if (cp->exclusive) {
- call->conn = candidate;
- call->security = candidate->security;
- call->security_ix = candidate->security_ix;
- call->service_id = candidate->service_id;
- _leave(" = 0 [exclusive %d]", candidate->debug_id);
- return 0;
- }
+ /* It wasn't. We need to add one. */
+ candidate = rxrpc_alloc_bundle(cp, gfp);
+ if (!candidate)
+ return NULL;
- /* Publish the new connection for userspace to find. We need to redo
- * the search before doing this lest we race with someone else adding a
- * conflicting instance.
- */
_debug("search 2");
- spin_lock(&local->client_conns_lock);
-
- pp = &local->client_conns.rb_node;
+ spin_lock(&local->client_bundles_lock);
+ pp = &local->client_bundles.rb_node;
parent = NULL;
while (*pp) {
parent = *pp;
- conn = rb_entry(parent, struct rxrpc_connection, client_node);
+ bundle = rb_entry(parent, struct rxrpc_bundle, local_node);
-#define cmp(X) ((long)conn->params.X - (long)candidate->params.X)
+#define cmp(X) ((long)bundle->params.X - (long)cp->X)
diff = (cmp(peer) ?:
cmp(key) ?:
cmp(security_level) ?:
cmp(upgrade));
#undef cmp
- if (diff < 0) {
+ if (diff < 0)
pp = &(*pp)->rb_left;
- } else if (diff > 0) {
+ else if (diff > 0)
pp = &(*pp)->rb_right;
- } else {
- if (rxrpc_may_reuse_conn(conn) &&
- rxrpc_get_connection_maybe(conn))
- goto found_extant_conn;
- /* The old connection is from an outdated epoch. */
- _debug("replace conn");
- clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags);
- rb_replace_node(&conn->client_node,
- &candidate->client_node,
- &local->client_conns);
- trace_rxrpc_client(conn, -1, rxrpc_client_replace);
- goto candidate_published;
- }
+ else
+ goto found_bundle_free;
}
- _debug("new conn");
- rb_link_node(&candidate->client_node, parent, pp);
- rb_insert_color(&candidate->client_node, &local->client_conns);
-
-candidate_published:
- set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
- call->conn = candidate;
- call->security = candidate->security;
- call->security_ix = candidate->security_ix;
- call->service_id = candidate->service_id;
- spin_unlock(&local->client_conns_lock);
- _leave(" = 0 [new %d]", candidate->debug_id);
- return 0;
+ _debug("new bundle");
+ candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id);
+ rb_link_node(&candidate->local_node, parent, pp);
+ rb_insert_color(&candidate->local_node, &local->client_bundles);
+ rxrpc_get_bundle(candidate);
+ spin_unlock(&local->client_bundles_lock);
+ _leave(" = %u [new]", candidate->debug_id);
+ return candidate;
+
+found_bundle_free:
+ kfree(candidate);
+found_bundle:
+ rxrpc_get_bundle(bundle);
+ spin_unlock(&local->client_bundles_lock);
+ _leave(" = %u [found]", bundle->debug_id);
+ return bundle;
+}
- /* We come here if we found a suitable connection already in existence.
- * Discard any candidate we may have allocated, and try to get a
- * channel on this one.
- */
-found_extant_conn:
- _debug("found conn");
- spin_unlock(&local->client_conns_lock);
+/*
+ * Create or find a client bundle to use for a call.
+ *
+ * If we return with a connection, the call will be on its waiting list. It's
+ * left to the caller to assign a channel and wake up the call.
+ */
+static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
+{
+ struct rxrpc_bundle *bundle;
- if (candidate) {
- trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate);
- rxrpc_put_connection(candidate);
- candidate = NULL;
- }
+ _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
- spin_lock(&conn->channel_lock);
- call->conn = conn;
- call->security = conn->security;
- call->security_ix = conn->security_ix;
- call->service_id = conn->service_id;
- list_add_tail(&call->chan_wait_link, &conn->waiting_calls);
- spin_unlock(&conn->channel_lock);
- _leave(" = 0 [extant %d]", conn->debug_id);
- return 0;
+ cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp);
+ if (!cp->peer)
+ goto error;
+
+ call->cong_cwnd = cp->peer->cong_cwnd;
+ if (call->cong_cwnd >= call->cong_ssthresh)
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ else
+ call->cong_mode = RXRPC_CALL_SLOW_START;
+ if (cp->upgrade)
+ __set_bit(RXRPC_CALL_UPGRADE, &call->flags);
+
+ /* Find the client connection bundle. */
+ bundle = rxrpc_look_up_bundle(cp, gfp);
+ if (!bundle)
+ goto error;
+
+ /* Get this call queued. Someone else may activate it whilst we're
+ * lining up a new connection, but that's fine.
+ */
+ spin_lock(&bundle->channel_lock);
+ list_add_tail(&call->chan_wait_link, &bundle->waiting_calls);
+ spin_unlock(&bundle->channel_lock);
+
+ _leave(" = [B=%x]", bundle->debug_id);
+ return bundle;
-error_peer:
- rxrpc_put_peer(cp->peer);
- cp->peer = NULL;
error:
- _leave(" = %d", ret);
- return ret;
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
}
/*
- * Activate a connection.
+ * Allocate a new connection and add it into a bundle.
*/
-static void rxrpc_activate_conn(struct rxrpc_net *rxnet,
- struct rxrpc_connection *conn)
+static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp)
+ __releases(bundle->channel_lock)
{
- if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) {
- trace_rxrpc_client(conn, -1, rxrpc_client_to_upgrade);
- conn->cache_state = RXRPC_CONN_CLIENT_UPGRADE;
- } else {
- trace_rxrpc_client(conn, -1, rxrpc_client_to_active);
- conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE;
- }
- rxnet->nr_active_client_conns++;
- list_move_tail(&conn->cache_link, &rxnet->active_client_conns);
-}
+ struct rxrpc_connection *candidate = NULL, *old = NULL;
+ bool conflict;
+ int i;
-/*
- * Attempt to animate a connection for a new call.
- *
- * If it's not exclusive, the connection is in the endpoint tree, and we're in
- * the conn's list of those waiting to grab a channel. There is, however, a
- * limit on the number of live connections allowed at any one time, so we may
- * have to wait for capacity to become available.
- *
- * Note that a connection on the waiting queue might *also* have active
- * channels if it has been culled to make space and then re-requested by a new
- * call.
- */
-static void rxrpc_animate_client_conn(struct rxrpc_net *rxnet,
- struct rxrpc_connection *conn)
-{
- unsigned int nr_conns;
+ _enter("");
- _enter("%d,%d", conn->debug_id, conn->cache_state);
+ conflict = bundle->alloc_conn;
+ if (!conflict)
+ bundle->alloc_conn = true;
+ spin_unlock(&bundle->channel_lock);
+ if (conflict) {
+ _leave(" [conf]");
+ return;
+ }
- if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE ||
- conn->cache_state == RXRPC_CONN_CLIENT_UPGRADE)
- goto out;
+ candidate = rxrpc_alloc_client_connection(bundle, gfp);
- spin_lock(&rxnet->client_conn_cache_lock);
+ spin_lock(&bundle->channel_lock);
+ bundle->alloc_conn = false;
- nr_conns = rxnet->nr_client_conns;
- if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
- trace_rxrpc_client(conn, -1, rxrpc_client_count);
- rxnet->nr_client_conns = nr_conns + 1;
+ if (IS_ERR(candidate)) {
+ bundle->alloc_error = PTR_ERR(candidate);
+ spin_unlock(&bundle->channel_lock);
+ _leave(" [err %ld]", PTR_ERR(candidate));
+ return;
}
- switch (conn->cache_state) {
- case RXRPC_CONN_CLIENT_ACTIVE:
- case RXRPC_CONN_CLIENT_UPGRADE:
- case RXRPC_CONN_CLIENT_WAITING:
- break;
-
- case RXRPC_CONN_CLIENT_INACTIVE:
- case RXRPC_CONN_CLIENT_CULLED:
- case RXRPC_CONN_CLIENT_IDLE:
- if (nr_conns >= rxrpc_max_client_connections)
- goto wait_for_capacity;
- goto activate_conn;
+ bundle->alloc_error = 0;
+
+ for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) {
+ unsigned int shift = i * RXRPC_MAXCALLS;
+ int j;
+
+ old = bundle->conns[i];
+ if (!rxrpc_may_reuse_conn(old)) {
+ if (old)
+ trace_rxrpc_client(old, -1, rxrpc_client_replace);
+ candidate->bundle_shift = shift;
+ bundle->conns[i] = candidate;
+ for (j = 0; j < RXRPC_MAXCALLS; j++)
+ set_bit(shift + j, &bundle->avail_chans);
+ candidate = NULL;
+ break;
+ }
- default:
- BUG();
+ old = NULL;
}
-out_unlock:
- spin_unlock(&rxnet->client_conn_cache_lock);
-out:
- _leave(" [%d]", conn->cache_state);
- return;
+ spin_unlock(&bundle->channel_lock);
-activate_conn:
- _debug("activate");
- rxrpc_activate_conn(rxnet, conn);
- goto out_unlock;
-
-wait_for_capacity:
- _debug("wait");
- trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
- conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
- list_move_tail(&conn->cache_link, &rxnet->waiting_client_conns);
- goto out_unlock;
+ if (candidate) {
+ _debug("discard C=%x", candidate->debug_id);
+ trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate);
+ rxrpc_put_connection(candidate);
+ }
+
+ rxrpc_put_connection(old);
+ _leave("");
}
/*
- * Deactivate a channel.
+ * Add a connection to a bundle if there are no usable connections or there are
+ * calls waiting for extra capacity.
*/
-static void rxrpc_deactivate_one_channel(struct rxrpc_connection *conn,
- unsigned int channel)
+static void rxrpc_maybe_add_conn(struct rxrpc_bundle *bundle, gfp_t gfp)
{
- struct rxrpc_channel *chan = &conn->channels[channel];
+ struct rxrpc_call *call;
+ int i, usable;
- rcu_assign_pointer(chan->call, NULL);
- conn->active_chans &= ~(1 << channel);
+ _enter("");
+
+ spin_lock(&bundle->channel_lock);
+
+ /* See if there are any usable connections. */
+ usable = 0;
+ for (i = 0; i < ARRAY_SIZE(bundle->conns); i++)
+ if (rxrpc_may_reuse_conn(bundle->conns[i]))
+ usable++;
+
+ if (!usable && !list_empty(&bundle->waiting_calls)) {
+ call = list_first_entry(&bundle->waiting_calls,
+ struct rxrpc_call, chan_wait_link);
+ if (test_bit(RXRPC_CALL_UPGRADE, &call->flags))
+ bundle->try_upgrade = true;
+ }
+
+ if (!usable)
+ goto alloc_conn;
+
+ if (!bundle->avail_chans &&
+ !bundle->try_upgrade &&
+ !list_empty(&bundle->waiting_calls) &&
+ usable < ARRAY_SIZE(bundle->conns))
+ goto alloc_conn;
+
+ spin_unlock(&bundle->channel_lock);
+ _leave("");
+ return;
+
+alloc_conn:
+ return rxrpc_add_conn_to_bundle(bundle, gfp);
}
/*
@@ -549,35 +502,42 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
unsigned int channel)
{
struct rxrpc_channel *chan = &conn->channels[channel];
- struct rxrpc_call *call = list_entry(conn->waiting_calls.next,
+ struct rxrpc_bundle *bundle = conn->bundle;
+ struct rxrpc_call *call = list_entry(bundle->waiting_calls.next,
struct rxrpc_call, chan_wait_link);
u32 call_id = chan->call_counter + 1;
+ _enter("C=%x,%u", conn->debug_id, channel);
+
trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate);
/* Cancel the final ACK on the previous call if it hasn't been sent yet
* as the DATA packet will implicitly ACK it.
*/
clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags);
-
- write_lock_bh(&call->state_lock);
- call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
- write_unlock_bh(&call->state_lock);
+ clear_bit(conn->bundle_shift + channel, &bundle->avail_chans);
rxrpc_see_call(call);
list_del_init(&call->chan_wait_link);
- conn->active_chans |= 1 << channel;
call->peer = rxrpc_get_peer(conn->params.peer);
+ call->conn = rxrpc_get_connection(conn);
call->cid = conn->proto.cid | channel;
call->call_id = call_id;
+ call->security = conn->security;
+ call->security_ix = conn->security_ix;
+ call->service_id = conn->service_id;
trace_rxrpc_connect_call(call);
_net("CONNECT call %08x:%08x as call %d on conn %d",
call->cid, call->call_id, call->debug_id, conn->debug_id);
- /* Paired with the read barrier in rxrpc_wait_for_channel(). This
- * orders cid and epoch in the connection wrt to call_id without the
- * need to take the channel_lock.
+ write_lock_bh(&call->state_lock);
+ call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
+ write_unlock_bh(&call->state_lock);
+
+ /* Paired with the read barrier in rxrpc_connect_call(). This orders
+ * cid and epoch in the connection wrt call_id without the need to
+ * take the channel_lock.
*
* We provisionally assign a callNumber at this point, but we don't
* confirm it until the call is about to be exposed.
@@ -586,101 +546,137 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
* at the call ID through a connection channel.
*/
smp_wmb();
- chan->call_id = call_id;
- chan->call_debug_id = call->debug_id;
+
+ chan->call_id = call_id;
+ chan->call_debug_id = call->debug_id;
rcu_assign_pointer(chan->call, call);
wake_up(&call->waitq);
}
/*
+ * Remove a connection from the idle list if it's on it.
+ */
+static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connection *conn)
+{
+ struct rxrpc_net *rxnet = bundle->params.local->rxnet;
+ bool drop_ref;
+
+ if (!list_empty(&conn->cache_link)) {
+ drop_ref = false;
+ spin_lock(&rxnet->client_conn_cache_lock);
+ if (!list_empty(&conn->cache_link)) {
+ list_del_init(&conn->cache_link);
+ drop_ref = true;
+ }
+ spin_unlock(&rxnet->client_conn_cache_lock);
+ if (drop_ref)
+ rxrpc_put_connection(conn);
+ }
+}
+
+/*
* Assign channels and callNumbers to waiting calls with channel_lock
* held by caller.
*/
-static void rxrpc_activate_channels_locked(struct rxrpc_connection *conn)
+static void rxrpc_activate_channels_locked(struct rxrpc_bundle *bundle)
{
- u8 avail, mask;
-
- switch (conn->cache_state) {
- case RXRPC_CONN_CLIENT_ACTIVE:
- mask = RXRPC_ACTIVE_CHANS_MASK;
- break;
- case RXRPC_CONN_CLIENT_UPGRADE:
- mask = 0x01;
- break;
- default:
- return;
- }
+ struct rxrpc_connection *conn;
+ unsigned long avail, mask;
+ unsigned int channel, slot;
- while (!list_empty(&conn->waiting_calls) &&
- (avail = ~conn->active_chans,
- avail &= mask,
- avail != 0))
- rxrpc_activate_one_channel(conn, __ffs(avail));
+ if (bundle->try_upgrade)
+ mask = 1;
+ else
+ mask = ULONG_MAX;
+
+ while (!list_empty(&bundle->waiting_calls)) {
+ avail = bundle->avail_chans & mask;
+ if (!avail)
+ break;
+ channel = __ffs(avail);
+ clear_bit(channel, &bundle->avail_chans);
+
+ slot = channel / RXRPC_MAXCALLS;
+ conn = bundle->conns[slot];
+ if (!conn)
+ break;
+
+ if (bundle->try_upgrade)
+ set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags);
+ rxrpc_unidle_conn(bundle, conn);
+
+ channel &= (RXRPC_MAXCALLS - 1);
+ conn->act_chans |= 1 << channel;
+ rxrpc_activate_one_channel(conn, channel);
+ }
}
/*
* Assign channels and callNumbers to waiting calls.
*/
-static void rxrpc_activate_channels(struct rxrpc_connection *conn)
+static void rxrpc_activate_channels(struct rxrpc_bundle *bundle)
{
- _enter("%d", conn->debug_id);
+ _enter("B=%x", bundle->debug_id);
- trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans);
+ trace_rxrpc_client(NULL, -1, rxrpc_client_activate_chans);
- if (conn->active_chans == RXRPC_ACTIVE_CHANS_MASK)
+ if (!bundle->avail_chans)
return;
- spin_lock(&conn->channel_lock);
- rxrpc_activate_channels_locked(conn);
- spin_unlock(&conn->channel_lock);
+ spin_lock(&bundle->channel_lock);
+ rxrpc_activate_channels_locked(bundle);
+ spin_unlock(&bundle->channel_lock);
_leave("");
}
/*
* Wait for a callNumber and a channel to be granted to a call.
*/
-static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp)
+static int rxrpc_wait_for_channel(struct rxrpc_bundle *bundle,
+ struct rxrpc_call *call, gfp_t gfp)
{
+ DECLARE_WAITQUEUE(myself, current);
int ret = 0;
_enter("%d", call->debug_id);
- if (!call->call_id) {
- DECLARE_WAITQUEUE(myself, current);
+ if (!gfpflags_allow_blocking(gfp)) {
+ rxrpc_maybe_add_conn(bundle, gfp);
+ rxrpc_activate_channels(bundle);
+ ret = bundle->alloc_error ?: -EAGAIN;
+ goto out;
+ }
- if (!gfpflags_allow_blocking(gfp)) {
- ret = -EAGAIN;
- goto out;
+ add_wait_queue_exclusive(&call->waitq, &myself);
+ for (;;) {
+ rxrpc_maybe_add_conn(bundle, gfp);
+ rxrpc_activate_channels(bundle);
+ ret = bundle->alloc_error;
+ if (ret < 0)
+ break;
+
+ switch (call->interruptibility) {
+ case RXRPC_INTERRUPTIBLE:
+ case RXRPC_PREINTERRUPTIBLE:
+ set_current_state(TASK_INTERRUPTIBLE);
+ break;
+ case RXRPC_UNINTERRUPTIBLE:
+ default:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ break;
}
-
- add_wait_queue_exclusive(&call->waitq, &myself);
- for (;;) {
- switch (call->interruptibility) {
- case RXRPC_INTERRUPTIBLE:
- case RXRPC_PREINTERRUPTIBLE:
- set_current_state(TASK_INTERRUPTIBLE);
- break;
- case RXRPC_UNINTERRUPTIBLE:
- default:
- set_current_state(TASK_UNINTERRUPTIBLE);
- break;
- }
- if (call->call_id)
- break;
- if ((call->interruptibility == RXRPC_INTERRUPTIBLE ||
- call->interruptibility == RXRPC_PREINTERRUPTIBLE) &&
- signal_pending(current)) {
- ret = -ERESTARTSYS;
- break;
- }
- schedule();
+ if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_AWAIT_CONN)
+ break;
+ if ((call->interruptibility == RXRPC_INTERRUPTIBLE ||
+ call->interruptibility == RXRPC_PREINTERRUPTIBLE) &&
+ signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
}
- remove_wait_queue(&call->waitq, &myself);
- __set_current_state(TASK_RUNNING);
+ schedule();
}
-
- /* Paired with the write barrier in rxrpc_activate_one_channel(). */
- smp_rmb();
+ remove_wait_queue(&call->waitq, &myself);
+ __set_current_state(TASK_RUNNING);
out:
_leave(" = %d", ret);
@@ -697,52 +693,50 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
struct sockaddr_rxrpc *srx,
gfp_t gfp)
{
+ struct rxrpc_bundle *bundle;
struct rxrpc_net *rxnet = cp->local->rxnet;
- int ret;
+ int ret = 0;
_enter("{%d,%lx},", call->debug_id, call->user_call_ID);
rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper);
- rxrpc_cull_active_client_conns(rxnet);
- ret = rxrpc_get_client_conn(rx, call, cp, srx, gfp);
- if (ret < 0)
+ bundle = rxrpc_prep_call(rx, call, cp, srx, gfp);
+ if (IS_ERR(bundle)) {
+ ret = PTR_ERR(bundle);
goto out;
+ }
- rxrpc_animate_client_conn(rxnet, call->conn);
- rxrpc_activate_channels(call->conn);
-
- ret = rxrpc_wait_for_channel(call, gfp);
- if (ret < 0) {
- trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed);
- rxrpc_disconnect_client_call(call);
- goto out;
+ if (call->state == RXRPC_CALL_CLIENT_AWAIT_CONN) {
+ ret = rxrpc_wait_for_channel(bundle, call, gfp);
+ if (ret < 0)
+ goto wait_failed;
}
- spin_lock_bh(&call->conn->params.peer->lock);
- hlist_add_head_rcu(&call->error_link,
- &call->conn->params.peer->error_targets);
- spin_unlock_bh(&call->conn->params.peer->lock);
+granted_channel:
+ /* Paired with the write barrier in rxrpc_activate_one_channel(). */
+ smp_rmb();
+out_put_bundle:
+ rxrpc_put_bundle(bundle);
out:
_leave(" = %d", ret);
return ret;
-}
-/*
- * Note that a connection is about to be exposed to the world. Once it is
- * exposed, we maintain an extra ref on it that stops it from being summarily
- * discarded before it's (a) had a chance to deal with retransmission and (b)
- * had a chance at re-use (the per-connection security negotiation is
- * expensive).
- */
-static void rxrpc_expose_client_conn(struct rxrpc_connection *conn,
- unsigned int channel)
-{
- if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
- trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
- rxrpc_get_connection(conn);
+wait_failed:
+ spin_lock(&bundle->channel_lock);
+ list_del_init(&call->chan_wait_link);
+ spin_unlock(&bundle->channel_lock);
+
+ if (call->state != RXRPC_CALL_CLIENT_AWAIT_CONN) {
+ ret = 0;
+ goto granted_channel;
}
+
+ trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed);
+ rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret);
+ rxrpc_disconnect_client_call(bundle, call);
+ goto out_put_bundle;
}
/*
@@ -764,7 +758,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
chan->call_counter++;
if (chan->call_counter >= INT_MAX)
set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
- rxrpc_expose_client_conn(conn, channel);
+ trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
}
}
@@ -773,62 +767,56 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
*/
static void rxrpc_set_client_reap_timer(struct rxrpc_net *rxnet)
{
- unsigned long now = jiffies;
- unsigned long reap_at = now + rxrpc_conn_idle_client_expiry;
+ if (!rxnet->kill_all_client_conns) {
+ unsigned long now = jiffies;
+ unsigned long reap_at = now + rxrpc_conn_idle_client_expiry;
- if (rxnet->live)
- timer_reduce(&rxnet->client_conn_reap_timer, reap_at);
+ if (rxnet->live)
+ timer_reduce(&rxnet->client_conn_reap_timer, reap_at);
+ }
}
/*
* Disconnect a client call.
*/
-void rxrpc_disconnect_client_call(struct rxrpc_call *call)
+void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call *call)
{
- struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_connection *conn;
struct rxrpc_channel *chan = NULL;
- struct rxrpc_net *rxnet = conn->params.local->rxnet;
- unsigned int channel = -1;
+ struct rxrpc_net *rxnet = bundle->params.local->rxnet;
+ unsigned int channel;
+ bool may_reuse;
u32 cid;
- spin_lock(&conn->channel_lock);
- set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
+ _enter("c=%x", call->debug_id);
- cid = call->cid;
- if (cid) {
- channel = cid & RXRPC_CHANNELMASK;
- chan = &conn->channels[channel];
- }
- trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
+ spin_lock(&bundle->channel_lock);
+ set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
/* Calls that have never actually been assigned a channel can simply be
- * discarded. If the conn didn't get used either, it will follow
- * immediately unless someone else grabs it in the meantime.
+ * discarded.
*/
- if (!list_empty(&call->chan_wait_link)) {
+ conn = call->conn;
+ if (!conn) {
_debug("call is waiting");
ASSERTCMP(call->call_id, ==, 0);
ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags));
list_del_init(&call->chan_wait_link);
-
- trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted);
-
- /* We must deactivate or idle the connection if it's now
- * waiting for nothing.
- */
- spin_lock(&rxnet->client_conn_cache_lock);
- if (conn->cache_state == RXRPC_CONN_CLIENT_WAITING &&
- list_empty(&conn->waiting_calls) &&
- !conn->active_chans)
- goto idle_connection;
goto out;
}
+ cid = call->cid;
+ channel = cid & RXRPC_CHANNELMASK;
+ chan = &conn->channels[channel];
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
+
if (rcu_access_pointer(chan->call) != call) {
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&bundle->channel_lock);
BUG();
}
+ may_reuse = rxrpc_may_reuse_conn(conn);
+
/* If a client call was exposed to the world, we save the result for
* retransmission.
*
@@ -841,14 +829,21 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
_debug("exposed %u,%u", call->call_id, call->abort_code);
__rxrpc_disconnect_call(conn, call);
+
+ if (test_and_clear_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) {
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_active);
+ bundle->try_upgrade = false;
+ if (may_reuse)
+ rxrpc_activate_channels_locked(bundle);
+ }
+
}
/* See if we can pass the channel directly to another call. */
- if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE &&
- !list_empty(&conn->waiting_calls)) {
+ if (may_reuse && !list_empty(&bundle->waiting_calls)) {
trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
rxrpc_activate_one_channel(conn, channel);
- goto out_2;
+ goto out;
}
/* Schedule the final ACK to be transmitted in a short while so that it
@@ -865,128 +860,99 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
rxrpc_reduce_conn_timer(conn, final_ack_at);
}
- /* Things are more complex and we need the cache lock. We might be
- * able to simply idle the conn or it might now be lurking on the wait
- * list. It might even get moved back to the active list whilst we're
- * waiting for the lock.
- */
- spin_lock(&rxnet->client_conn_cache_lock);
-
- switch (conn->cache_state) {
- case RXRPC_CONN_CLIENT_UPGRADE:
- /* Deal with termination of a service upgrade probe. */
- if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
- clear_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags);
- trace_rxrpc_client(conn, channel, rxrpc_client_to_active);
- conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE;
- rxrpc_activate_channels_locked(conn);
- }
- fallthrough;
- case RXRPC_CONN_CLIENT_ACTIVE:
- if (list_empty(&conn->waiting_calls)) {
- rxrpc_deactivate_one_channel(conn, channel);
- if (!conn->active_chans) {
- rxnet->nr_active_client_conns--;
- goto idle_connection;
- }
- goto out;
- }
-
- trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
- rxrpc_activate_one_channel(conn, channel);
- goto out;
+ /* Deactivate the channel. */
+ rcu_assign_pointer(chan->call, NULL);
+ set_bit(conn->bundle_shift + channel, &conn->bundle->avail_chans);
+ conn->act_chans &= ~(1 << channel);
- case RXRPC_CONN_CLIENT_CULLED:
- rxrpc_deactivate_one_channel(conn, channel);
- ASSERT(list_empty(&conn->waiting_calls));
- if (!conn->active_chans)
- goto idle_connection;
- goto out;
+ /* If no channels remain active, then put the connection on the idle
+ * list for a short while. Give it a ref to stop it going away if it
+ * becomes unbundled.
+ */
+ if (!conn->act_chans) {
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_idle);
+ conn->idle_timestamp = jiffies;
- case RXRPC_CONN_CLIENT_WAITING:
- rxrpc_deactivate_one_channel(conn, channel);
- goto out;
+ rxrpc_get_connection(conn);
+ spin_lock(&rxnet->client_conn_cache_lock);
+ list_move_tail(&conn->cache_link, &rxnet->idle_client_conns);
+ spin_unlock(&rxnet->client_conn_cache_lock);
- default:
- BUG();
+ rxrpc_set_client_reap_timer(rxnet);
}
out:
- spin_unlock(&rxnet->client_conn_cache_lock);
-out_2:
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&bundle->channel_lock);
_leave("");
return;
+}
-idle_connection:
- /* As no channels remain active, the connection gets deactivated
- * immediately or moved to the idle list for a short while.
- */
- if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
- trace_rxrpc_client(conn, channel, rxrpc_client_to_idle);
- conn->idle_timestamp = jiffies;
- conn->cache_state = RXRPC_CONN_CLIENT_IDLE;
- list_move_tail(&conn->cache_link, &rxnet->idle_client_conns);
- if (rxnet->idle_client_conns.next == &conn->cache_link &&
- !rxnet->kill_all_client_conns)
- rxrpc_set_client_reap_timer(rxnet);
- } else {
- trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive);
- conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
- list_del_init(&conn->cache_link);
+/*
+ * Remove a connection from a bundle.
+ */
+static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
+{
+ struct rxrpc_bundle *bundle = conn->bundle;
+ struct rxrpc_local *local = bundle->params.local;
+ unsigned int bindex;
+ bool need_drop = false, need_put = false;
+ int i;
+
+ _enter("C=%x", conn->debug_id);
+
+ if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK)
+ rxrpc_process_delayed_final_acks(conn, true);
+
+ spin_lock(&bundle->channel_lock);
+ bindex = conn->bundle_shift / RXRPC_MAXCALLS;
+ if (bundle->conns[bindex] == conn) {
+ _debug("clear slot %u", bindex);
+ bundle->conns[bindex] = NULL;
+ for (i = 0; i < RXRPC_MAXCALLS; i++)
+ clear_bit(conn->bundle_shift + i, &bundle->avail_chans);
+ need_drop = true;
}
- goto out;
+ spin_unlock(&bundle->channel_lock);
+
+ /* If there are no more connections, remove the bundle */
+ if (!bundle->avail_chans) {
+ _debug("maybe unbundle");
+ spin_lock(&local->client_bundles_lock);
+
+ for (i = 0; i < ARRAY_SIZE(bundle->conns); i++)
+ if (bundle->conns[i])
+ break;
+ if (i == ARRAY_SIZE(bundle->conns) && !bundle->params.exclusive) {
+ _debug("erase bundle");
+ rb_erase(&bundle->local_node, &local->client_bundles);
+ need_put = true;
+ }
+
+ spin_unlock(&local->client_bundles_lock);
+ if (need_put)
+ rxrpc_put_bundle(bundle);
+ }
+
+ if (need_drop)
+ rxrpc_put_connection(conn);
+ _leave("");
}
/*
* Clean up a dead client connection.
*/
-static struct rxrpc_connection *
-rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
+static void rxrpc_kill_client_conn(struct rxrpc_connection *conn)
{
- struct rxrpc_connection *next = NULL;
struct rxrpc_local *local = conn->params.local;
struct rxrpc_net *rxnet = local->rxnet;
- unsigned int nr_conns;
- trace_rxrpc_client(conn, -1, rxrpc_client_cleanup);
+ _enter("C=%x", conn->debug_id);
- if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) {
- spin_lock(&local->client_conns_lock);
- if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS,
- &conn->flags))
- rb_erase(&conn->client_node, &local->client_conns);
- spin_unlock(&local->client_conns_lock);
- }
+ trace_rxrpc_client(conn, -1, rxrpc_client_cleanup);
+ atomic_dec(&rxnet->nr_client_conns);
rxrpc_put_client_connection_id(conn);
-
- ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE);
-
- if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
- trace_rxrpc_client(conn, -1, rxrpc_client_uncount);
- spin_lock(&rxnet->client_conn_cache_lock);
- nr_conns = --rxnet->nr_client_conns;
-
- if (nr_conns < rxrpc_max_client_connections &&
- !list_empty(&rxnet->waiting_client_conns)) {
- next = list_entry(rxnet->waiting_client_conns.next,
- struct rxrpc_connection, cache_link);
- rxrpc_get_connection(next);
- rxrpc_activate_conn(rxnet, next);
- }
-
- spin_unlock(&rxnet->client_conn_cache_lock);
- }
-
rxrpc_kill_connection(conn);
- if (next)
- rxrpc_activate_channels(next);
-
- /* We need to get rid of the temporary ref we took upon next, but we
- * can't call rxrpc_put_connection() recursively.
- */
- return next;
}
/*
@@ -998,63 +964,12 @@ void rxrpc_put_client_conn(struct rxrpc_connection *conn)
unsigned int debug_id = conn->debug_id;
int n;
- do {
- n = atomic_dec_return(&conn->usage);
- trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here);
- if (n > 0)
- return;
+ n = atomic_dec_return(&conn->usage);
+ trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here);
+ if (n <= 0) {
ASSERTCMP(n, >=, 0);
-
- conn = rxrpc_put_one_client_conn(conn);
- } while (conn);
-}
-
-/*
- * Kill the longest-active client connections to make room for new ones.
- */
-static void rxrpc_cull_active_client_conns(struct rxrpc_net *rxnet)
-{
- struct rxrpc_connection *conn;
- unsigned int nr_conns = rxnet->nr_client_conns;
- unsigned int nr_active, limit;
-
- _enter("");
-
- ASSERTCMP(nr_conns, >=, 0);
- if (nr_conns < rxrpc_max_client_connections) {
- _leave(" [ok]");
- return;
+ rxrpc_kill_client_conn(conn);
}
- limit = rxrpc_reap_client_connections;
-
- spin_lock(&rxnet->client_conn_cache_lock);
- nr_active = rxnet->nr_active_client_conns;
-
- while (nr_active > limit) {
- ASSERT(!list_empty(&rxnet->active_client_conns));
- conn = list_entry(rxnet->active_client_conns.next,
- struct rxrpc_connection, cache_link);
- ASSERTIFCMP(conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE,
- conn->cache_state, ==, RXRPC_CONN_CLIENT_UPGRADE);
-
- if (list_empty(&conn->waiting_calls)) {
- trace_rxrpc_client(conn, -1, rxrpc_client_to_culled);
- conn->cache_state = RXRPC_CONN_CLIENT_CULLED;
- list_del_init(&conn->cache_link);
- } else {
- trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
- conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
- list_move_tail(&conn->cache_link,
- &rxnet->waiting_client_conns);
- }
-
- nr_active--;
- }
-
- rxnet->nr_active_client_conns = nr_active;
- spin_unlock(&rxnet->client_conn_cache_lock);
- ASSERTCMP(nr_active, >=, 0);
- _leave(" [culled]");
}
/*
@@ -1088,7 +1003,7 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work)
/* We keep an estimate of what the number of conns ought to be after
* we've discarded some so that we don't overdo the discarding.
*/
- nr_conns = rxnet->nr_client_conns;
+ nr_conns = atomic_read(&rxnet->nr_client_conns);
next:
spin_lock(&rxnet->client_conn_cache_lock);
@@ -1098,7 +1013,6 @@ next:
conn = list_entry(rxnet->idle_client_conns.next,
struct rxrpc_connection, cache_link);
- ASSERT(test_bit(RXRPC_CONN_EXPOSED, &conn->flags));
if (!rxnet->kill_all_client_conns) {
/* If the number of connections is over the reap limit, we
@@ -1120,18 +1034,13 @@ next:
}
trace_rxrpc_client(conn, -1, rxrpc_client_discard);
- if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags))
- BUG();
- conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
list_del_init(&conn->cache_link);
spin_unlock(&rxnet->client_conn_cache_lock);
- /* When we cleared the EXPOSED flag, we took on responsibility for the
- * reference that that had on the usage count. We deal with that here.
- * If someone re-sets the flag and re-gets the ref, that's fine.
- */
- rxrpc_put_connection(conn);
+ rxrpc_unbundle_conn(conn);
+ rxrpc_put_connection(conn); /* Drop the ->cache_link ref */
+
nr_conns--;
goto next;
@@ -1145,8 +1054,7 @@ not_yet_expired:
*/
_debug("not yet");
if (!rxnet->kill_all_client_conns)
- timer_reduce(&rxnet->client_conn_reap_timer,
- conn_expires_at);
+ timer_reduce(&rxnet->client_conn_reap_timer, conn_expires_at);
out:
spin_unlock(&rxnet->client_conn_cache_lock);
@@ -1181,37 +1089,27 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local)
{
struct rxrpc_connection *conn, *tmp;
struct rxrpc_net *rxnet = local->rxnet;
- unsigned int nr_active;
LIST_HEAD(graveyard);
_enter("");
spin_lock(&rxnet->client_conn_cache_lock);
- nr_active = rxnet->nr_active_client_conns;
list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns,
cache_link) {
if (conn->params.local == local) {
- ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_IDLE);
-
trace_rxrpc_client(conn, -1, rxrpc_client_discard);
- if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags))
- BUG();
- conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
list_move(&conn->cache_link, &graveyard);
- nr_active--;
}
}
- rxnet->nr_active_client_conns = nr_active;
spin_unlock(&rxnet->client_conn_cache_lock);
- ASSERTCMP(nr_active, >=, 0);
while (!list_empty(&graveyard)) {
conn = list_entry(graveyard.next,
struct rxrpc_connection, cache_link);
list_del_init(&conn->cache_link);
-
+ rxrpc_unbundle_conn(conn);
rxrpc_put_connection(conn);
}
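
Throughout conn_client.c, bundles are looked up in the per-local rb-tree keyed by the tuple (peer, key, security_level, upgrade); the cmp(X) ?: chain used during the tree walk is a lexicographic compare in which the first non-zero difference decides the direction. A hedged standalone sketch of the same comparator, with plain long fields standing in for the kernel pointers and all names invented for illustration.

/* Key a bundle by (peer, key, security_level, upgrade); first difference wins. */
struct bundle_key {
	long peer;
	long key;
	long security_level;
	long upgrade;
};

static long bundle_cmp(const struct bundle_key *a, const struct bundle_key *b)
{
	long diff;

	if ((diff = a->peer - b->peer))
		return diff;
	if ((diff = a->key - b->key))
		return diff;
	if ((diff = a->security_level - b->security_level))
		return diff;
	return a->upgrade - b->upgrade;
}

/* Tree walk: diff < 0 -> go left, diff > 0 -> go right, 0 -> bundle found. */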
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 64ace2960ecc..aab069701398 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -157,12 +157,12 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
_enter("{%d},%x", conn->debug_id, conn->abort_code);
- spin_lock(&conn->channel_lock);
+ spin_lock(&conn->bundle->channel_lock);
for (i = 0; i < RXRPC_MAXCALLS; i++) {
call = rcu_dereference_protected(
conn->channels[i].call,
- lockdep_is_held(&conn->channel_lock));
+ lockdep_is_held(&conn->bundle->channel_lock));
if (call) {
if (compl == RXRPC_CALL_LOCALLY_ABORTED)
trace_rxrpc_abort(call->debug_id,
@@ -179,7 +179,7 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
}
}
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
_leave("");
}
@@ -210,6 +210,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
conn->error = error;
conn->abort_code = abort_code;
conn->state = RXRPC_CONN_LOCALLY_ABORTED;
+ set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
spin_unlock_bh(&conn->state_lock);
msg.msg_name = &conn->params.peer->srx.transport;
@@ -319,6 +320,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
conn->error = -ECONNABORTED;
conn->abort_code = abort_code;
conn->state = RXRPC_CONN_REMOTELY_ABORTED;
+ set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial);
return -ECONNABORTED;
@@ -331,15 +333,12 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
if (ret < 0)
return ret;
- ret = conn->security->init_connection_security(conn);
+ ret = conn->security->init_connection_security(
+ conn, conn->params.key->payload.data[0]);
if (ret < 0)
return ret;
- ret = conn->security->prime_packet_security(conn);
- if (ret < 0)
- return ret;
-
- spin_lock(&conn->channel_lock);
+ spin_lock(&conn->bundle->channel_lock);
spin_lock_bh(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) {
@@ -349,12 +348,12 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
rxrpc_call_is_secure(
rcu_dereference_protected(
conn->channels[loop].call,
- lockdep_is_held(&conn->channel_lock)));
+ lockdep_is_held(&conn->bundle->channel_lock)));
} else {
spin_unlock_bh(&conn->state_lock);
}
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
return 0;
default:
@@ -375,7 +374,6 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn)
_enter("{%d}", conn->debug_id);
ASSERT(conn->security_ix != 0);
- ASSERT(conn->server_key);
if (conn->security->issue_challenge(conn) < 0) {
abort_code = RX_CALL_DEAD;
@@ -395,7 +393,7 @@ abort:
/*
* Process delayed final ACKs that we haven't subsumed into a subsequent call.
*/
-static void rxrpc_process_delayed_final_acks(struct rxrpc_connection *conn)
+void rxrpc_process_delayed_final_acks(struct rxrpc_connection *conn, bool force)
{
unsigned long j = jiffies, next_j;
unsigned int channel;
@@ -414,7 +412,7 @@ again:
smp_rmb(); /* vs rxrpc_disconnect_client_call */
ack_at = READ_ONCE(chan->final_ack_at);
- if (time_before(j, ack_at)) {
+ if (time_before(j, ack_at) && !force) {
if (time_before(ack_at, next_j)) {
next_j = ack_at;
set = true;
@@ -448,7 +446,7 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn)
/* Process delayed ACKs whose time has come. */
if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK)
- rxrpc_process_delayed_final_acks(conn);
+ rxrpc_process_delayed_final_acks(conn, false);
/* go through the conn-level event packets, releasing the ref on this
* connection that each one has when we've finished with it */
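
rxrpc_process_delayed_final_acks() now takes a force flag so that unbundling a connection can flush any final ACKs still pending instead of waiting for their timers; the normal event-processing path passes false and only sends ACKs whose deadline has passed. The pattern is simply "skip entries that are not yet due unless forced". A small sketch under that assumption, with jiffies-style deadlines replaced by a plain counter and all names illustrative.

#include <stdbool.h>

#define NCHAN 4

struct chan {
	bool ack_pending;
	unsigned long ack_at;	/* deadline, in the same units as 'now' */
};

/* Send final ACKs whose time has come; with force, flush everything pending. */
static void process_delayed_final_acks(struct chan chans[NCHAN],
				       unsigned long now, bool force)
{
	for (int i = 0; i < NCHAN; i++) {
		if (!chans[i].ack_pending)
			continue;
		if (now < chans[i].ack_at && !force)
			continue;	/* not due yet and not being flushed */
		chans[i].ack_pending = false;
		/* ... transmit the final ACK for channel i here ... */
	}
}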
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 8cbe0bf20ed5..b2159dbf5412 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -41,8 +41,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
conn = kzalloc(sizeof(struct rxrpc_connection), gfp);
if (conn) {
INIT_LIST_HEAD(&conn->cache_link);
- spin_lock_init(&conn->channel_lock);
- INIT_LIST_HEAD(&conn->waiting_calls);
timer_setup(&conn->timer, &rxrpc_connection_timer, 0);
INIT_WORK(&conn->processor, &rxrpc_process_connection);
INIT_LIST_HEAD(&conn->proc_link);
@@ -51,7 +49,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
conn->security = &rxrpc_no_security;
spin_lock_init(&conn->state_lock);
conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
- conn->size_align = 4;
conn->idle_timestamp = jiffies;
}
@@ -219,11 +216,11 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
}
if (rxrpc_is_client_call(call))
- return rxrpc_disconnect_client_call(call);
+ return rxrpc_disconnect_client_call(conn->bundle, call);
- spin_lock(&conn->channel_lock);
+ spin_lock(&conn->bundle->channel_lock);
__rxrpc_disconnect_call(conn, call);
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
conn->idle_timestamp = jiffies;
@@ -292,12 +289,13 @@ void rxrpc_see_connection(struct rxrpc_connection *conn)
/*
* Get a ref on a connection.
*/
-void rxrpc_get_connection(struct rxrpc_connection *conn)
+struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn)
{
const void *here = __builtin_return_address(0);
int n = atomic_inc_return(&conn->usage);
trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n, here);
+ return conn;
}
/*
@@ -364,7 +362,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu)
conn->security->clear(conn);
key_put(conn->params.key);
- key_put(conn->server_key);
+ rxrpc_put_bundle(conn->bundle);
rxrpc_put_peer(conn->params.peer);
if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns))
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
index 21da48e3d2e5..e1966dfc9152 100644
--- a/net/rxrpc/conn_service.c
+++ b/net/rxrpc/conn_service.c
@@ -8,6 +8,12 @@
#include <linux/slab.h>
#include "ar-internal.h"
+static struct rxrpc_bundle rxrpc_service_dummy_bundle = {
+ .usage = ATOMIC_INIT(1),
+ .debug_id = UINT_MAX,
+ .channel_lock = __SPIN_LOCK_UNLOCKED(&rxrpc_service_dummy_bundle.channel_lock),
+};
+
/*
* Find a service connection under RCU conditions.
*
@@ -127,6 +133,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn
*/
conn->state = RXRPC_CONN_SERVICE_PREALLOC;
atomic_set(&conn->usage, 2);
+ conn->bundle = rxrpc_get_bundle(&rxrpc_service_dummy_bundle);
atomic_inc(&rxnet->nr_conns);
write_lock(&rxnet->conn_lock);
@@ -149,7 +156,6 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn
void rxrpc_new_incoming_connection(struct rxrpc_sock *rx,
struct rxrpc_connection *conn,
const struct rxrpc_security *sec,
- struct key *key,
struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
@@ -163,7 +169,6 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx,
conn->security_ix = sp->hdr.securityIndex;
conn->out_clientflag = 0;
conn->security = sec;
- conn->server_key = key_get(key);
if (conn->security_ix)
conn->state = RXRPC_CONN_SERVICE_UNSECURED;
else
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 667c44aa5a63..dc201363f2c4 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -430,7 +430,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb)
return;
}
- if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) {
+ if (state == RXRPC_CALL_SERVER_RECV_REQUEST) {
unsigned long timo = READ_ONCE(call->next_req_timo);
unsigned long now, expect_req_by;
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index f6c59f5fae9d..9aae99d67833 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -8,20 +8,25 @@
#include <net/af_rxrpc.h>
#include "ar-internal.h"
-static int none_init_connection_security(struct rxrpc_connection *conn)
+static int none_init_connection_security(struct rxrpc_connection *conn,
+ struct rxrpc_key_token *token)
{
return 0;
}
-static int none_prime_packet_security(struct rxrpc_connection *conn)
+/*
+ * Work out how much data we can put in an unsecured packet.
+ */
+static int none_how_much_data(struct rxrpc_call *call, size_t remain,
+ size_t *_buf_size, size_t *_data_size, size_t *_offset)
{
+ *_buf_size = *_data_size = min_t(size_t, remain, RXRPC_JUMBO_DATALEN);
+ *_offset = 0;
return 0;
}
-static int none_secure_packet(struct rxrpc_call *call,
- struct sk_buff *skb,
- size_t data_size,
- void *sechdr)
+static int none_secure_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ size_t data_size)
{
return 0;
}
@@ -86,8 +91,8 @@ const struct rxrpc_security rxrpc_no_security = {
.init = none_init,
.exit = none_exit,
.init_connection_security = none_init_connection_security,
- .prime_packet_security = none_prime_packet_security,
.free_call_crypto = none_free_call_crypto,
+ .how_much_data = none_how_much_data,
.secure_packet = none_secure_packet,
.verify_packet = none_verify_packet,
.locate_data = none_locate_data,
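The prime_packet_security hook is dropped from the security ops and replaced by how_much_data, which tells the sender how big a DATA packet buffer to allocate, how much payload fits in it, and at what offset the payload starts; for the null security all three are trivial. A hedged sketch of a generic caller (the function name is illustrative; the real user is the sendmsg path shown further down):

	static int example_size_packet(struct rxrpc_call *call, size_t remain)
	{
		size_t buf_size, data_size, offset;
		int ret;

		ret = call->conn->security->how_much_data(call, remain,
							  &buf_size, &data_size, &offset);
		if (ret < 0)
			return ret;

		/* Allocate buf_size bytes; the security header occupies the
		 * first offset bytes and at most data_size bytes of payload
		 * follow it.
		 */
		return 0;
	}
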
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 2e8bd3b97301..8d2073e0e3da 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -5,7 +5,7 @@
* Written by David Howells (dhowells@redhat.com)
*
 * RxRPC keys should have a description describing their purpose:
- * "afs@CAMBRIDGE.REDHAT.COM>
+ * "afs@example.com"
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -23,13 +23,9 @@
#include <keys/user-type.h>
#include "ar-internal.h"
-static int rxrpc_vet_description_s(const char *);
static int rxrpc_preparse(struct key_preparsed_payload *);
-static int rxrpc_preparse_s(struct key_preparsed_payload *);
static void rxrpc_free_preparse(struct key_preparsed_payload *);
-static void rxrpc_free_preparse_s(struct key_preparsed_payload *);
static void rxrpc_destroy(struct key *);
-static void rxrpc_destroy_s(struct key *);
static void rxrpc_describe(const struct key *, struct seq_file *);
static long rxrpc_read(const struct key *, char *, size_t);
@@ -50,38 +46,6 @@ struct key_type key_type_rxrpc = {
EXPORT_SYMBOL(key_type_rxrpc);
/*
- * rxrpc server defined keys take "<serviceId>:<securityIndex>" as the
- * description and an 8-byte decryption key as the payload
- */
-struct key_type key_type_rxrpc_s = {
- .name = "rxrpc_s",
- .flags = KEY_TYPE_NET_DOMAIN,
- .vet_description = rxrpc_vet_description_s,
- .preparse = rxrpc_preparse_s,
- .free_preparse = rxrpc_free_preparse_s,
- .instantiate = generic_key_instantiate,
- .destroy = rxrpc_destroy_s,
- .describe = rxrpc_describe,
-};
-
-/*
- * Vet the description for an RxRPC server key
- */
-static int rxrpc_vet_description_s(const char *desc)
-{
- unsigned long num;
- char *p;
-
- num = simple_strtoul(desc, &p, 10);
- if (*p != ':' || num > 65535)
- return -EINVAL;
- num = simple_strtoul(p + 1, &p, 10);
- if (*p || num < 1 || num > 255)
- return -EINVAL;
- return 0;
-}
-
-/*
* parse an RxKAD type XDR format token
* - the caller guarantees we have at least 4 words
*/
@@ -165,402 +129,17 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep,
return 0;
}
-static void rxrpc_free_krb5_principal(struct krb5_principal *princ)
-{
- int loop;
-
- if (princ->name_parts) {
- for (loop = princ->n_name_parts - 1; loop >= 0; loop--)
- kfree(princ->name_parts[loop]);
- kfree(princ->name_parts);
- }
- kfree(princ->realm);
-}
-
-static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td)
-{
- kfree(td->data);
-}
-
-/*
- * free up an RxK5 token
- */
-static void rxrpc_rxk5_free(struct rxk5_key *rxk5)
-{
- int loop;
-
- rxrpc_free_krb5_principal(&rxk5->client);
- rxrpc_free_krb5_principal(&rxk5->server);
- rxrpc_free_krb5_tagged(&rxk5->session);
-
- if (rxk5->addresses) {
- for (loop = rxk5->n_addresses - 1; loop >= 0; loop--)
- rxrpc_free_krb5_tagged(&rxk5->addresses[loop]);
- kfree(rxk5->addresses);
- }
- if (rxk5->authdata) {
- for (loop = rxk5->n_authdata - 1; loop >= 0; loop--)
- rxrpc_free_krb5_tagged(&rxk5->authdata[loop]);
- kfree(rxk5->authdata);
- }
-
- kfree(rxk5->ticket);
- kfree(rxk5->ticket2);
- kfree(rxk5);
-}
-
-/*
- * extract a krb5 principal
- */
-static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
- const __be32 **_xdr,
- unsigned int *_toklen)
-{
- const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, n_parts, loop, tmp, paddedlen;
-
- /* there must be at least one name, and at least #names+1 length
- * words */
- if (toklen <= 12)
- return -EINVAL;
-
- _enter(",{%x,%x,%x},%u",
- ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen);
-
- n_parts = ntohl(*xdr++);
- toklen -= 4;
- if (n_parts <= 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX)
- return -EINVAL;
- princ->n_name_parts = n_parts;
-
- if (toklen <= (n_parts + 1) * 4)
- return -EINVAL;
-
- princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL);
- if (!princ->name_parts)
- return -ENOMEM;
-
- for (loop = 0; loop < n_parts; loop++) {
- if (toklen < 4)
- return -EINVAL;
- tmp = ntohl(*xdr++);
- toklen -= 4;
- if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX)
- return -EINVAL;
- paddedlen = (tmp + 3) & ~3;
- if (paddedlen > toklen)
- return -EINVAL;
- princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL);
- if (!princ->name_parts[loop])
- return -ENOMEM;
- memcpy(princ->name_parts[loop], xdr, tmp);
- princ->name_parts[loop][tmp] = 0;
- toklen -= paddedlen;
- xdr += paddedlen >> 2;
- }
-
- if (toklen < 4)
- return -EINVAL;
- tmp = ntohl(*xdr++);
- toklen -= 4;
- if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX)
- return -EINVAL;
- paddedlen = (tmp + 3) & ~3;
- if (paddedlen > toklen)
- return -EINVAL;
- princ->realm = kmalloc(tmp + 1, GFP_KERNEL);
- if (!princ->realm)
- return -ENOMEM;
- memcpy(princ->realm, xdr, tmp);
- princ->realm[tmp] = 0;
- toklen -= paddedlen;
- xdr += paddedlen >> 2;
-
- _debug("%s/...@%s", princ->name_parts[0], princ->realm);
-
- *_xdr = xdr;
- *_toklen = toklen;
- _leave(" = 0 [toklen=%u]", toklen);
- return 0;
-}
-
-/*
- * extract a piece of krb5 tagged data
- */
-static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td,
- size_t max_data_size,
- const __be32 **_xdr,
- unsigned int *_toklen)
-{
- const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, len, paddedlen;
-
- /* there must be at least one tag and one length word */
- if (toklen <= 8)
- return -EINVAL;
-
- _enter(",%zu,{%x,%x},%u",
- max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen);
-
- td->tag = ntohl(*xdr++);
- len = ntohl(*xdr++);
- toklen -= 8;
- if (len > max_data_size)
- return -EINVAL;
- paddedlen = (len + 3) & ~3;
- if (paddedlen > toklen)
- return -EINVAL;
- td->data_len = len;
-
- if (len > 0) {
- td->data = kmemdup(xdr, len, GFP_KERNEL);
- if (!td->data)
- return -ENOMEM;
- toklen -= paddedlen;
- xdr += paddedlen >> 2;
- }
-
- _debug("tag %x len %x", td->tag, td->data_len);
-
- *_xdr = xdr;
- *_toklen = toklen;
- _leave(" = 0 [toklen=%u]", toklen);
- return 0;
-}
-
-/*
- * extract an array of tagged data
- */
-static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td,
- u8 *_n_elem,
- u8 max_n_elem,
- size_t max_elem_size,
- const __be32 **_xdr,
- unsigned int *_toklen)
-{
- struct krb5_tagged_data *td;
- const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, n_elem, loop;
- int ret;
-
- /* there must be at least one count */
- if (toklen < 4)
- return -EINVAL;
-
- _enter(",,%u,%zu,{%x},%u",
- max_n_elem, max_elem_size, ntohl(xdr[0]), toklen);
-
- n_elem = ntohl(*xdr++);
- toklen -= 4;
- if (n_elem > max_n_elem)
- return -EINVAL;
- *_n_elem = n_elem;
- if (n_elem > 0) {
- if (toklen <= (n_elem + 1) * 4)
- return -EINVAL;
-
- _debug("n_elem %d", n_elem);
-
- td = kcalloc(n_elem, sizeof(struct krb5_tagged_data),
- GFP_KERNEL);
- if (!td)
- return -ENOMEM;
- *_td = td;
-
- for (loop = 0; loop < n_elem; loop++) {
- ret = rxrpc_krb5_decode_tagged_data(&td[loop],
- max_elem_size,
- &xdr, &toklen);
- if (ret < 0)
- return ret;
- }
- }
-
- *_xdr = xdr;
- *_toklen = toklen;
- _leave(" = 0 [toklen=%u]", toklen);
- return 0;
-}
-
-/*
- * extract a krb5 ticket
- */
-static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
- const __be32 **_xdr, unsigned int *_toklen)
-{
- const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, len, paddedlen;
-
- /* there must be at least one length word */
- if (toklen <= 4)
- return -EINVAL;
-
- _enter(",{%x},%u", ntohl(xdr[0]), toklen);
-
- len = ntohl(*xdr++);
- toklen -= 4;
- if (len > AFSTOKEN_K5_TIX_MAX)
- return -EINVAL;
- paddedlen = (len + 3) & ~3;
- if (paddedlen > toklen)
- return -EINVAL;
- *_tktlen = len;
-
- _debug("ticket len %u", len);
-
- if (len > 0) {
- *_ticket = kmemdup(xdr, len, GFP_KERNEL);
- if (!*_ticket)
- return -ENOMEM;
- toklen -= paddedlen;
- xdr += paddedlen >> 2;
- }
-
- *_xdr = xdr;
- *_toklen = toklen;
- _leave(" = 0 [toklen=%u]", toklen);
- return 0;
-}
-
-/*
- * parse an RxK5 type XDR format token
- * - the caller guarantees we have at least 4 words
- */
-static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep,
- size_t datalen,
- const __be32 *xdr, unsigned int toklen)
-{
- struct rxrpc_key_token *token, **pptoken;
- struct rxk5_key *rxk5;
- const __be32 *end_xdr = xdr + (toklen >> 2);
- time64_t expiry;
- int ret;
-
- _enter(",{%x,%x,%x,%x},%u",
- ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
- toklen);
-
- /* reserve some payload space for this subkey - the length of the token
- * is a reasonable approximation */
- prep->quotalen = datalen + toklen;
-
- token = kzalloc(sizeof(*token), GFP_KERNEL);
- if (!token)
- return -ENOMEM;
-
- rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL);
- if (!rxk5) {
- kfree(token);
- return -ENOMEM;
- }
-
- token->security_index = RXRPC_SECURITY_RXK5;
- token->k5 = rxk5;
-
- /* extract the principals */
- ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen);
- if (ret < 0)
- goto error;
- ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen);
- if (ret < 0)
- goto error;
-
- /* extract the session key and the encoding type (the tag field ->
- * ENCTYPE_xxx) */
- ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX,
- &xdr, &toklen);
- if (ret < 0)
- goto error;
-
- if (toklen < 4 * 8 + 2 * 4)
- goto inval;
- rxk5->authtime = be64_to_cpup((const __be64 *) xdr);
- xdr += 2;
- rxk5->starttime = be64_to_cpup((const __be64 *) xdr);
- xdr += 2;
- rxk5->endtime = be64_to_cpup((const __be64 *) xdr);
- xdr += 2;
- rxk5->renew_till = be64_to_cpup((const __be64 *) xdr);
- xdr += 2;
- rxk5->is_skey = ntohl(*xdr++);
- rxk5->flags = ntohl(*xdr++);
- toklen -= 4 * 8 + 2 * 4;
-
- _debug("times: a=%llx s=%llx e=%llx rt=%llx",
- rxk5->authtime, rxk5->starttime, rxk5->endtime,
- rxk5->renew_till);
- _debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags);
-
- /* extract the permitted client addresses */
- ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses,
- &rxk5->n_addresses,
- AFSTOKEN_K5_ADDRESSES_MAX,
- AFSTOKEN_DATA_MAX,
- &xdr, &toklen);
- if (ret < 0)
- goto error;
-
- ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
-
- /* extract the tickets */
- ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len,
- &xdr, &toklen);
- if (ret < 0)
- goto error;
- ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len,
- &xdr, &toklen);
- if (ret < 0)
- goto error;
-
- ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
-
- /* extract the typed auth data */
- ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata,
- &rxk5->n_authdata,
- AFSTOKEN_K5_AUTHDATA_MAX,
- AFSTOKEN_BDATALN_MAX,
- &xdr, &toklen);
- if (ret < 0)
- goto error;
-
- ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
-
- if (toklen != 0)
- goto inval;
-
- /* attach the payload */
- for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0];
- *pptoken;
- pptoken = &(*pptoken)->next)
- continue;
- *pptoken = token;
- expiry = rxrpc_u32_to_time64(token->k5->endtime);
- if (expiry < prep->expiry)
- prep->expiry = expiry;
-
- _leave(" = 0");
- return 0;
-
-inval:
- ret = -EINVAL;
-error:
- rxrpc_rxk5_free(rxk5);
- kfree(token);
- _leave(" = %d", ret);
- return ret;
-}
-
/*
* attempt to parse the data as the XDR format
* - the caller guarantees we have more than 7 words
*/
static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
{
- const __be32 *xdr = prep->data, *token;
+ const __be32 *xdr = prep->data, *token, *p;
const char *cp;
unsigned int len, paddedlen, loop, ntoken, toklen, sec_ix;
size_t datalen = prep->datalen;
- int ret;
+ int ret, ret2;
_enter(",{%x,%x,%x,%x},%zu",
ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
@@ -610,20 +189,20 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
goto not_xdr;
/* check each token wrapper */
- token = xdr;
+ p = xdr;
loop = ntoken;
do {
if (datalen < 8)
goto not_xdr;
- toklen = ntohl(*xdr++);
- sec_ix = ntohl(*xdr);
+ toklen = ntohl(*p++);
+ sec_ix = ntohl(*p);
datalen -= 4;
_debug("token: [%x/%zx] %x", toklen, datalen, sec_ix);
paddedlen = (toklen + 3) & ~3;
if (toklen < 20 || toklen > datalen || paddedlen > datalen)
goto not_xdr;
datalen -= paddedlen;
- xdr += paddedlen >> 2;
+ p += paddedlen >> 2;
} while (--loop > 0);
@@ -634,44 +213,50 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
/* okay: we're going to assume it's valid XDR format
* - we ignore the cellname, relying on the key to be correctly named
*/
+ ret = -EPROTONOSUPPORT;
do {
- xdr = token;
toklen = ntohl(*xdr++);
- token = xdr + ((toklen + 3) >> 2);
- sec_ix = ntohl(*xdr++);
+ token = xdr;
+ xdr += (toklen + 3) / 4;
+
+ sec_ix = ntohl(*token++);
toklen -= 4;
- _debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token);
+ _debug("TOKEN type=%x len=%x", sec_ix, toklen);
switch (sec_ix) {
case RXRPC_SECURITY_RXKAD:
- ret = rxrpc_preparse_xdr_rxkad(prep, datalen, xdr, toklen);
- if (ret != 0)
- goto error;
+ ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen);
+ break;
+ default:
+ ret2 = -EPROTONOSUPPORT;
break;
+ }
- case RXRPC_SECURITY_RXK5:
- ret = rxrpc_preparse_xdr_rxk5(prep, datalen, xdr, toklen);
+ switch (ret2) {
+ case 0:
+ ret = 0;
+ break;
+ case -EPROTONOSUPPORT:
+ break;
+ case -ENOPKG:
if (ret != 0)
- goto error;
+ ret = -ENOPKG;
break;
-
default:
- ret = -EPROTONOSUPPORT;
+ ret = ret2;
goto error;
}
} while (--ntoken > 0);
- _leave(" = 0");
- return 0;
+error:
+ _leave(" = %d", ret);
+ return ret;
not_xdr:
_leave(" = -EPROTO");
return -EPROTO;
-error:
- _leave(" = %d", ret);
- return ret;
}
/*
@@ -805,10 +390,6 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token)
case RXRPC_SECURITY_RXKAD:
kfree(token->kad);
break;
- case RXRPC_SECURITY_RXK5:
- if (token->k5)
- rxrpc_rxk5_free(token->k5);
- break;
default:
pr_err("Unknown token type %x on rxrpc key\n",
token->security_index);
@@ -828,45 +409,6 @@ static void rxrpc_free_preparse(struct key_preparsed_payload *prep)
}
/*
- * Preparse a server secret key.
- *
- * The data should be the 8-byte secret key.
- */
-static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
-{
- struct crypto_skcipher *ci;
-
- _enter("%zu", prep->datalen);
-
- if (prep->datalen != 8)
- return -EINVAL;
-
- memcpy(&prep->payload.data[2], prep->data, 8);
-
- ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(ci)) {
- _leave(" = %ld", PTR_ERR(ci));
- return PTR_ERR(ci);
- }
-
- if (crypto_skcipher_setkey(ci, prep->data, 8) < 0)
- BUG();
-
- prep->payload.data[0] = ci;
- _leave(" = 0");
- return 0;
-}
-
-/*
- * Clean up preparse data.
- */
-static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep)
-{
- if (prep->payload.data[0])
- crypto_free_skcipher(prep->payload.data[0]);
-}
-
-/*
* dispose of the data dangling from the corpse of a rxrpc key
*/
static void rxrpc_destroy(struct key *key)
@@ -875,22 +417,29 @@ static void rxrpc_destroy(struct key *key)
}
/*
- * dispose of the data dangling from the corpse of a rxrpc key
- */
-static void rxrpc_destroy_s(struct key *key)
-{
- if (key->payload.data[0]) {
- crypto_free_skcipher(key->payload.data[0]);
- key->payload.data[0] = NULL;
- }
-}
-
-/*
* describe the rxrpc key
*/
static void rxrpc_describe(const struct key *key, struct seq_file *m)
{
+ const struct rxrpc_key_token *token;
+ const char *sep = ": ";
+
seq_puts(m, key->description);
+
+ for (token = key->payload.data[0]; token; token = token->next) {
+ seq_puts(m, sep);
+
+ switch (token->security_index) {
+ case RXRPC_SECURITY_RXKAD:
+ seq_puts(m, "ka");
+ break;
+ default: /* we have a ticket we can't encode */
+ seq_printf(m, "%u", token->security_index);
+ break;
+ }
+
+ sep = " ";
+ }
}
/*
@@ -924,36 +473,6 @@ int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen)
}
/*
- * grab the security keyring for a server socket
- */
-int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen)
-{
- struct key *key;
- char *description;
-
- _enter("");
-
- if (optlen <= 0 || optlen > PAGE_SIZE - 1)
- return -EINVAL;
-
- description = memdup_sockptr_nul(optval, optlen);
- if (IS_ERR(description))
- return PTR_ERR(description);
-
- key = request_key(&key_type_keyring, description, NULL);
- if (IS_ERR(key)) {
- kfree(description);
- _leave(" = %ld", PTR_ERR(key));
- return PTR_ERR(key);
- }
-
- rx->securities = key;
- kfree(description);
- _leave(" = 0 [key %x]", key->serial);
- return 0;
-}
-
-/*
* generate a server data key
*/
int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
@@ -1044,12 +563,10 @@ static long rxrpc_read(const struct key *key,
char *buffer, size_t buflen)
{
const struct rxrpc_key_token *token;
- const struct krb5_principal *princ;
size_t size;
__be32 *xdr, *oldxdr;
u32 cnlen, toksize, ntoks, tok, zero;
u16 toksizes[AFSTOKEN_MAX];
- int loop;
_enter("");
@@ -1074,42 +591,14 @@ static long rxrpc_read(const struct key *key,
case RXRPC_SECURITY_RXKAD:
toksize += 8 * 4; /* viceid, kvno, key*2, begin,
* end, primary, tktlen */
- toksize += RND(token->kad->ticket_len);
- break;
-
- case RXRPC_SECURITY_RXK5:
- princ = &token->k5->client;
- toksize += 4 + princ->n_name_parts * 4;
- for (loop = 0; loop < princ->n_name_parts; loop++)
- toksize += RND(strlen(princ->name_parts[loop]));
- toksize += 4 + RND(strlen(princ->realm));
-
- princ = &token->k5->server;
- toksize += 4 + princ->n_name_parts * 4;
- for (loop = 0; loop < princ->n_name_parts; loop++)
- toksize += RND(strlen(princ->name_parts[loop]));
- toksize += 4 + RND(strlen(princ->realm));
-
- toksize += 8 + RND(token->k5->session.data_len);
-
- toksize += 4 * 8 + 2 * 4;
-
- toksize += 4 + token->k5->n_addresses * 8;
- for (loop = 0; loop < token->k5->n_addresses; loop++)
- toksize += RND(token->k5->addresses[loop].data_len);
-
- toksize += 4 + RND(token->k5->ticket_len);
- toksize += 4 + RND(token->k5->ticket2_len);
-
- toksize += 4 + token->k5->n_authdata * 8;
- for (loop = 0; loop < token->k5->n_authdata; loop++)
- toksize += RND(token->k5->authdata[loop].data_len);
+ if (!token->no_leak_key)
+ toksize += RND(token->kad->ticket_len);
break;
default: /* we have a ticket we can't encode */
pr_err("Unsupported key token type (%u)\n",
token->security_index);
- continue;
+ return -ENOPKG;
}
_debug("token[%u]: toksize=%u", ntoks, toksize);
@@ -1178,53 +667,16 @@ static long rxrpc_read(const struct key *key,
ENCODE(token->kad->start);
ENCODE(token->kad->expiry);
ENCODE(token->kad->primary_flag);
- ENCODE_DATA(token->kad->ticket_len, token->kad->ticket);
- break;
-
- case RXRPC_SECURITY_RXK5:
- princ = &token->k5->client;
- ENCODE(princ->n_name_parts);
- for (loop = 0; loop < princ->n_name_parts; loop++)
- ENCODE_STR(princ->name_parts[loop]);
- ENCODE_STR(princ->realm);
-
- princ = &token->k5->server;
- ENCODE(princ->n_name_parts);
- for (loop = 0; loop < princ->n_name_parts; loop++)
- ENCODE_STR(princ->name_parts[loop]);
- ENCODE_STR(princ->realm);
-
- ENCODE(token->k5->session.tag);
- ENCODE_DATA(token->k5->session.data_len,
- token->k5->session.data);
-
- ENCODE64(token->k5->authtime);
- ENCODE64(token->k5->starttime);
- ENCODE64(token->k5->endtime);
- ENCODE64(token->k5->renew_till);
- ENCODE(token->k5->is_skey);
- ENCODE(token->k5->flags);
-
- ENCODE(token->k5->n_addresses);
- for (loop = 0; loop < token->k5->n_addresses; loop++) {
- ENCODE(token->k5->addresses[loop].tag);
- ENCODE_DATA(token->k5->addresses[loop].data_len,
- token->k5->addresses[loop].data);
- }
-
- ENCODE_DATA(token->k5->ticket_len, token->k5->ticket);
- ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2);
-
- ENCODE(token->k5->n_authdata);
- for (loop = 0; loop < token->k5->n_authdata; loop++) {
- ENCODE(token->k5->authdata[loop].tag);
- ENCODE_DATA(token->k5->authdata[loop].data_len,
- token->k5->authdata[loop].data);
- }
+ if (token->no_leak_key)
+ ENCODE(0);
+ else
+ ENCODE_DATA(token->kad->ticket_len, token->kad->ticket);
break;
default:
- break;
+ pr_err("Unsupported key token type (%u)\n",
+ token->security_index);
+ return -ENOPKG;
}
ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==,
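With the RxK5 support removed, rxrpc_describe() now appends a one-word summary per remaining token after the key description. As a worked example (key name and token types are illustrative), a key named afs@example.com holding one rxkad token followed by a token of unrecognised type 5 would be described as:

	afs@example.com: ka 5
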
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index ede058f9cc15..8c2881054266 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -86,8 +86,8 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet,
init_rwsem(&local->defrag_sem);
skb_queue_head_init(&local->reject_queue);
skb_queue_head_init(&local->event_queue);
- local->client_conns = RB_ROOT;
- spin_lock_init(&local->client_conns_lock);
+ local->client_bundles = RB_ROOT;
+ spin_lock_init(&local->client_bundles_lock);
spin_lock_init(&local->lock);
rwlock_init(&local->services_lock);
local->debug_id = atomic_inc_return(&rxrpc_debug_id);
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index b312aab80fed..25bbc4cc8b13 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -62,13 +62,10 @@ static __net_init int rxrpc_init_net(struct net *net)
timer_setup(&rxnet->service_conn_reap_timer,
rxrpc_service_conn_reap_timeout, 0);
- rxnet->nr_client_conns = 0;
- rxnet->nr_active_client_conns = 0;
+ atomic_set(&rxnet->nr_client_conns, 0);
rxnet->kill_all_client_conns = false;
spin_lock_init(&rxnet->client_conn_cache_lock);
spin_lock_init(&rxnet->client_conn_discard_lock);
- INIT_LIST_HEAD(&rxnet->waiting_client_conns);
- INIT_LIST_HEAD(&rxnet->active_client_conns);
INIT_LIST_HEAD(&rxnet->idle_client_conns);
INIT_WORK(&rxnet->client_conn_reaper,
rxrpc_discard_expired_client_conns);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 3cfff7922ba8..10f2bf2e9068 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -357,6 +357,12 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
_enter(",{%d}", skb->len);
+ if (hlist_unhashed(&call->error_link)) {
+ spin_lock_bh(&call->peer->lock);
+ hlist_add_head_rcu(&call->error_link, &call->peer->error_targets);
+ spin_unlock_bh(&call->peer->lock);
+ }
+
/* Each transmission of a Tx packet needs a new serial number */
serial = atomic_inc_return(&conn->serial);
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 543afd9bd664..e2f990754f88 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -165,7 +165,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
"Proto Local "
" Remote "
" SvID ConnID End Use State Key "
- " Serial ISerial\n"
+ " Serial ISerial CallId0 CallId1 CallId2 CallId3\n"
);
return 0;
}
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 2c842851d72e..fef3573fdc8b 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -69,7 +69,7 @@ bool __rxrpc_set_call_completion(struct rxrpc_call *call,
if (call->state < RXRPC_CALL_COMPLETE) {
call->abort_code = abort_code;
call->error = error;
- call->completion = compl,
+ call->completion = compl;
call->state = RXRPC_CALL_COMPLETE;
trace_rxrpc_call_complete(call);
wake_up(&call->waitq);
diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c
index 1221b0637a7e..4e565eeab426 100644
--- a/net/rxrpc/rtt.c
+++ b/net/rxrpc/rtt.c
@@ -14,7 +14,6 @@
#define RXRPC_RTO_MAX ((unsigned)(120 * HZ))
#define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */
#define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */
-#define rxrpc_min_rtt_wlen 300 /* As sysctl_tcp_min_rtt_wlen */
static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer)
{
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index e08130e5746b..e2e9e9b0a6d7 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -15,6 +15,7 @@
#include <linux/scatterlist.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/key-type.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <keys/rxrpc-type.h>
@@ -27,6 +28,7 @@
#define INST_SZ 40 /* size of principal's instance */
#define REALM_SZ 40 /* size of principal's auth domain */
#define SNAME_SZ 40 /* size of service name */
+#define RXKAD_ALIGN 8
struct rxkad_level1_hdr {
__be32 data_size; /* true data size (excluding padding) */
@@ -37,6 +39,9 @@ struct rxkad_level2_hdr {
__be32 checksum; /* decrypted data checksum */
};
+static int rxkad_prime_packet_security(struct rxrpc_connection *conn,
+ struct crypto_sync_skcipher *ci);
+
/*
* this holds a pinned cipher so that keventd doesn't get called by the cipher
* alloc routine, but since we have it to hand, we use it to decrypt RESPONSE
@@ -47,17 +52,59 @@ static struct skcipher_request *rxkad_ci_req;
static DEFINE_MUTEX(rxkad_ci_mutex);
/*
+ * Parse the information from a server key
+ *
+ * The data should be the 8-byte secret key.
+ */
+static int rxkad_preparse_server_key(struct key_preparsed_payload *prep)
+{
+ struct crypto_skcipher *ci;
+
+ if (prep->datalen != 8)
+ return -EINVAL;
+
+ memcpy(&prep->payload.data[2], prep->data, 8);
+
+ ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(ci)) {
+ _leave(" = %ld", PTR_ERR(ci));
+ return PTR_ERR(ci);
+ }
+
+ if (crypto_skcipher_setkey(ci, prep->data, 8) < 0)
+ BUG();
+
+ prep->payload.data[0] = ci;
+ _leave(" = 0");
+ return 0;
+}
+
+static void rxkad_free_preparse_server_key(struct key_preparsed_payload *prep)
+{
+
+ if (prep->payload.data[0])
+ crypto_free_skcipher(prep->payload.data[0]);
+}
+
+static void rxkad_destroy_server_key(struct key *key)
+{
+ if (key->payload.data[0]) {
+ crypto_free_skcipher(key->payload.data[0]);
+ key->payload.data[0] = NULL;
+ }
+}
+
+/*
* initialise connection security
*/
-static int rxkad_init_connection_security(struct rxrpc_connection *conn)
+static int rxkad_init_connection_security(struct rxrpc_connection *conn,
+ struct rxrpc_key_token *token)
{
struct crypto_sync_skcipher *ci;
- struct rxrpc_key_token *token;
int ret;
_enter("{%d},{%x}", conn->debug_id, key_serial(conn->params.key));
- token = conn->params.key->payload.data[0];
conn->security_ix = token->security_index;
ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0);
@@ -73,32 +120,68 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
switch (conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
- break;
case RXRPC_SECURITY_AUTH:
- conn->size_align = 8;
- conn->security_size = sizeof(struct rxkad_level1_hdr);
- break;
case RXRPC_SECURITY_ENCRYPT:
- conn->size_align = 8;
- conn->security_size = sizeof(struct rxkad_level2_hdr);
break;
default:
ret = -EKEYREJECTED;
goto error;
}
- conn->cipher = ci;
- ret = 0;
+ ret = rxkad_prime_packet_security(conn, ci);
+ if (ret < 0)
+ goto error_ci;
+
+ conn->rxkad.cipher = ci;
+ return 0;
+
+error_ci:
+ crypto_free_sync_skcipher(ci);
error:
_leave(" = %d", ret);
return ret;
}
/*
+ * Work out how much data we can put in a packet.
+ */
+static int rxkad_how_much_data(struct rxrpc_call *call, size_t remain,
+ size_t *_buf_size, size_t *_data_size, size_t *_offset)
+{
+ size_t shdr, buf_size, chunk;
+
+ switch (call->conn->params.security_level) {
+ default:
+ buf_size = chunk = min_t(size_t, remain, RXRPC_JUMBO_DATALEN);
+ shdr = 0;
+ goto out;
+ case RXRPC_SECURITY_AUTH:
+ shdr = sizeof(struct rxkad_level1_hdr);
+ break;
+ case RXRPC_SECURITY_ENCRYPT:
+ shdr = sizeof(struct rxkad_level2_hdr);
+ break;
+ }
+
+ buf_size = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN);
+
+ chunk = buf_size - shdr;
+ if (remain < chunk)
+ buf_size = round_up(shdr + remain, RXKAD_ALIGN);
+
+out:
+ *_buf_size = buf_size;
+ *_data_size = chunk;
+ *_offset = shdr;
+ return 0;
+}
+
+/*
* prime the encryption state with the invariant parts of a connection's
* description
*/
-static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
+static int rxkad_prime_packet_security(struct rxrpc_connection *conn,
+ struct crypto_sync_skcipher *ci)
{
struct skcipher_request *req;
struct rxrpc_key_token *token;
@@ -116,7 +199,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
if (!tmpbuf)
return -ENOMEM;
- req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS);
+ req = skcipher_request_alloc(&ci->base, GFP_NOFS);
if (!req) {
kfree(tmpbuf);
return -ENOMEM;
@@ -131,13 +214,13 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
tmpbuf[3] = htonl(conn->security_ix);
sg_init_one(&sg, tmpbuf, tmpsize);
- skcipher_request_set_sync_tfm(req, conn->cipher);
+ skcipher_request_set_sync_tfm(req, ci);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x);
crypto_skcipher_encrypt(req);
skcipher_request_free(req);
- memcpy(&conn->csum_iv, tmpbuf + 2, sizeof(conn->csum_iv));
+ memcpy(&conn->rxkad.csum_iv, tmpbuf + 2, sizeof(conn->rxkad.csum_iv));
kfree(tmpbuf);
_leave(" = 0");
return 0;
@@ -149,7 +232,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
*/
static struct skcipher_request *rxkad_get_call_crypto(struct rxrpc_call *call)
{
- struct crypto_skcipher *tfm = &call->conn->cipher->base;
+ struct crypto_skcipher *tfm = &call->conn->rxkad.cipher->base;
struct skcipher_request *cipher_req = call->cipher_req;
if (!cipher_req) {
@@ -176,15 +259,14 @@ static void rxkad_free_call_crypto(struct rxrpc_call *call)
* partially encrypt a packet (level 1 security)
*/
static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 data_size,
- void *sechdr,
+ struct sk_buff *skb, u32 data_size,
struct skcipher_request *req)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxkad_level1_hdr hdr;
struct rxrpc_crypt iv;
struct scatterlist sg;
+ size_t pad;
u16 check;
_enter("");
@@ -193,13 +275,19 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
data_size |= (u32)check << 16;
hdr.data_size = htonl(data_size);
- memcpy(sechdr, &hdr, sizeof(hdr));
+ memcpy(skb->head, &hdr, sizeof(hdr));
+
+ pad = sizeof(struct rxkad_level1_hdr) + data_size;
+ pad = RXKAD_ALIGN - pad;
+ pad &= RXKAD_ALIGN - 1;
+ if (pad)
+ skb_put_zero(skb, pad);
/* start the encryption afresh */
memset(&iv, 0, sizeof(iv));
- sg_init_one(&sg, sechdr, 8);
- skcipher_request_set_sync_tfm(req, call->conn->cipher);
+ sg_init_one(&sg, skb->head, 8);
+ skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
crypto_skcipher_encrypt(req);
@@ -215,7 +303,6 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
struct sk_buff *skb,
u32 data_size,
- void *sechdr,
struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
@@ -224,6 +311,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
struct rxrpc_crypt iv;
struct scatterlist sg[16];
unsigned int len;
+ size_t pad;
u16 check;
int err;
@@ -235,14 +323,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
rxkhdr.data_size = htonl(data_size | (u32)check << 16);
rxkhdr.checksum = 0;
- memcpy(sechdr, &rxkhdr, sizeof(rxkhdr));
+ memcpy(skb->head, &rxkhdr, sizeof(rxkhdr));
+
+ pad = sizeof(struct rxkad_level2_hdr) + data_size;
+ pad = RXKAD_ALIGN - pad;
+ pad &= RXKAD_ALIGN - 1;
+ if (pad)
+ skb_put_zero(skb, pad);
/* encrypt from the session key */
token = call->conn->params.key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- sg_init_one(&sg[0], sechdr, sizeof(rxkhdr));
- skcipher_request_set_sync_tfm(req, call->conn->cipher);
+ sg_init_one(&sg[0], skb->head, sizeof(rxkhdr));
+ skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x);
crypto_skcipher_encrypt(req);
@@ -252,11 +346,10 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
if (skb_shinfo(skb)->nr_frags > 16)
goto out;
- len = data_size + call->conn->size_align - 1;
- len &= ~(call->conn->size_align - 1);
+ len = round_up(data_size, RXKAD_ALIGN);
sg_init_table(sg, ARRAY_SIZE(sg));
- err = skb_to_sgvec(skb, sg, 0, len);
+ err = skb_to_sgvec(skb, sg, 8, len);
if (unlikely(err < 0))
goto out;
skcipher_request_set_crypt(req, sg, sg, len, iv.x);
@@ -275,8 +368,7 @@ out:
*/
static int rxkad_secure_packet(struct rxrpc_call *call,
struct sk_buff *skb,
- size_t data_size,
- void *sechdr)
+ size_t data_size)
{
struct rxrpc_skb_priv *sp;
struct skcipher_request *req;
@@ -291,7 +383,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
call->debug_id, key_serial(call->conn->params.key),
sp->hdr.seq, data_size);
- if (!call->conn->cipher)
+ if (!call->conn->rxkad.cipher)
return 0;
ret = key_validate(call->conn->params.key);
@@ -303,7 +395,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
return -ENOMEM;
/* continue encrypting from where we left off */
- memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
+ memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv));
/* calculate the security checksum */
x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
@@ -312,7 +404,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
call->crypto_buf[1] = htonl(x);
sg_init_one(&sg, call->crypto_buf, 8);
- skcipher_request_set_sync_tfm(req, call->conn->cipher);
+ skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
crypto_skcipher_encrypt(req);
@@ -329,12 +421,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
ret = 0;
break;
case RXRPC_SECURITY_AUTH:
- ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr,
- req);
+ ret = rxkad_secure_packet_auth(call, skb, data_size, req);
break;
case RXRPC_SECURITY_ENCRYPT:
- ret = rxkad_secure_packet_encrypt(call, skb, data_size,
- sechdr, req);
+ ret = rxkad_secure_packet_encrypt(call, skb, data_size, req);
break;
default:
ret = -EPERM;
@@ -380,7 +470,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
/* start the decryption afresh */
memset(&iv, 0, sizeof(iv));
- skcipher_request_set_sync_tfm(req, call->conn->cipher);
+ skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, 8, iv.x);
crypto_skcipher_decrypt(req);
@@ -472,7 +562,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
token = call->conn->params.key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- skcipher_request_set_sync_tfm(req, call->conn->cipher);
+ skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, iv.x);
crypto_skcipher_decrypt(req);
@@ -538,7 +628,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
_enter("{%d{%x}},{#%u}",
call->debug_id, key_serial(call->conn->params.key), seq);
- if (!call->conn->cipher)
+ if (!call->conn->rxkad.cipher)
return 0;
req = rxkad_get_call_crypto(call);
@@ -546,7 +636,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
return -ENOMEM;
/* continue encrypting from where we left off */
- memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
+ memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv));
/* validate the security checksum */
x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
@@ -555,7 +645,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
call->crypto_buf[1] = htonl(x);
sg_init_one(&sg, call->crypto_buf, 8);
- skcipher_request_set_sync_tfm(req, call->conn->cipher);
+ skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
crypto_skcipher_encrypt(req);
@@ -648,16 +738,12 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
u32 serial;
int ret;
- _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
+ _enter("{%d}", conn->debug_id);
- ret = key_validate(conn->server_key);
- if (ret < 0)
- return ret;
-
- get_random_bytes(&conn->security_nonce, sizeof(conn->security_nonce));
+ get_random_bytes(&conn->rxkad.nonce, sizeof(conn->rxkad.nonce));
challenge.version = htonl(2);
- challenge.nonce = htonl(conn->security_nonce);
+ challenge.nonce = htonl(conn->rxkad.nonce);
challenge.min_level = htonl(0);
challenge.__padding = 0;
@@ -785,7 +871,7 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn,
struct rxrpc_crypt iv;
struct scatterlist sg[1];
- req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS);
+ req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS);
if (!req)
return -ENOMEM;
@@ -794,7 +880,7 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn,
sg_init_table(sg, 1);
sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted));
- skcipher_request_set_sync_tfm(req, conn->cipher);
+ skcipher_request_set_sync_tfm(req, conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
crypto_skcipher_encrypt(req);
@@ -892,6 +978,7 @@ other_error:
* decrypt the kerberos IV ticket in the response
*/
static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
+ struct key *server_key,
struct sk_buff *skb,
void *ticket, size_t ticket_len,
struct rxrpc_crypt *_session_key,
@@ -911,30 +998,17 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
u32 abort_code;
u8 *p, *q, *name, *end;
- _enter("{%d},{%x}", conn->debug_id, key_serial(conn->server_key));
+ _enter("{%d},{%x}", conn->debug_id, key_serial(server_key));
*_expiry = 0;
- ret = key_validate(conn->server_key);
- if (ret < 0) {
- switch (ret) {
- case -EKEYEXPIRED:
- abort_code = RXKADEXPIRED;
- goto other_error;
- default:
- abort_code = RXKADNOAUTH;
- goto other_error;
- }
- }
-
- ASSERT(conn->server_key->payload.data[0] != NULL);
+ ASSERT(server_key->payload.data[0] != NULL);
ASSERTCMP((unsigned long) ticket & 7UL, ==, 0);
- memcpy(&iv, &conn->server_key->payload.data[2], sizeof(iv));
+ memcpy(&iv, &server_key->payload.data[2], sizeof(iv));
ret = -ENOMEM;
- req = skcipher_request_alloc(conn->server_key->payload.data[0],
- GFP_NOFS);
+ req = skcipher_request_alloc(server_key->payload.data[0], GFP_NOFS);
if (!req)
goto temporary_error;
@@ -1090,6 +1164,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
struct rxkad_response *response;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_crypt session_key;
+ struct key *server_key;
const char *eproto;
time64_t expiry;
void *ticket;
@@ -1097,7 +1172,27 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
__be32 csum;
int ret, i;
- _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
+ _enter("{%d}", conn->debug_id);
+
+ server_key = rxrpc_look_up_server_security(conn, skb, 0, 0);
+ if (IS_ERR(server_key)) {
+ switch (PTR_ERR(server_key)) {
+ case -ENOKEY:
+ abort_code = RXKADUNKNOWNKEY;
+ break;
+ case -EKEYEXPIRED:
+ abort_code = RXKADEXPIRED;
+ break;
+ default:
+ abort_code = RXKADNOAUTH;
+ break;
+ }
+ trace_rxrpc_abort(0, "SVK",
+ sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ abort_code, PTR_ERR(server_key));
+ *_abort_code = abort_code;
+ return -EPROTO;
+ }
ret = -ENOMEM;
response = kzalloc(sizeof(struct rxkad_response), GFP_NOFS);
@@ -1109,8 +1204,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
response, sizeof(*response)) < 0)
goto protocol_error;
- if (!pskb_pull(skb, sizeof(*response)))
- BUG();
version = ntohl(response->version);
ticket_len = ntohl(response->ticket_len);
@@ -1141,12 +1234,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
eproto = tracepoint_string("rxkad_tkt_short");
abort_code = RXKADPACKETSHORT;
- if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response),
ticket, ticket_len) < 0)
goto protocol_error_free;
- ret = rxkad_decrypt_ticket(conn, skb, ticket, ticket_len, &session_key,
- &expiry, _abort_code);
+ ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len,
+ &session_key, &expiry, _abort_code);
if (ret < 0)
goto temporary_error_free_ticket;
@@ -1169,7 +1262,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
if (response->encrypted.checksum != csum)
goto protocol_error_free;
- spin_lock(&conn->channel_lock);
+ spin_lock(&conn->bundle->channel_lock);
for (i = 0; i < RXRPC_MAXCALLS; i++) {
struct rxrpc_call *call;
u32 call_id = ntohl(response->encrypted.call_id[i]);
@@ -1186,17 +1279,17 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
if (call_id > conn->channels[i].call_counter) {
call = rcu_dereference_protected(
conn->channels[i].call,
- lockdep_is_held(&conn->channel_lock));
+ lockdep_is_held(&conn->bundle->channel_lock));
if (call && call->state < RXRPC_CALL_COMPLETE)
goto protocol_error_unlock;
conn->channels[i].call_counter = call_id;
}
}
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
eproto = tracepoint_string("rxkad_rsp_seq");
abort_code = RXKADOUTOFSEQUENCE;
- if (ntohl(response->encrypted.inc_nonce) != conn->security_nonce + 1)
+ if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1)
goto protocol_error_free;
eproto = tracepoint_string("rxkad_rsp_level");
@@ -1219,12 +1312,13 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
return 0;
protocol_error_unlock:
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
protocol_error_free:
kfree(ticket);
protocol_error:
kfree(response);
trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto);
+ key_put(server_key);
*_abort_code = abort_code;
return -EPROTO;
@@ -1237,6 +1331,7 @@ temporary_error:
* ENOMEM. We just want to send the challenge again. Note that we
* also come out this way if the ticket decryption fails.
*/
+ key_put(server_key);
return ret;
}
@@ -1247,8 +1342,8 @@ static void rxkad_clear(struct rxrpc_connection *conn)
{
_enter("");
- if (conn->cipher)
- crypto_free_sync_skcipher(conn->cipher);
+ if (conn->rxkad.cipher)
+ crypto_free_sync_skcipher(conn->rxkad.cipher);
}
/*
@@ -1296,8 +1391,11 @@ const struct rxrpc_security rxkad = {
.no_key_abort = RXKADUNKNOWNKEY,
.init = rxkad_init,
.exit = rxkad_exit,
+ .preparse_server_key = rxkad_preparse_server_key,
+ .free_preparse_server_key = rxkad_free_preparse_server_key,
+ .destroy_server_key = rxkad_destroy_server_key,
.init_connection_security = rxkad_init_connection_security,
- .prime_packet_security = rxkad_prime_packet_security,
+ .how_much_data = rxkad_how_much_data,
.secure_packet = rxkad_secure_packet,
.verify_packet = rxkad_verify_packet,
.free_call_crypto = rxkad_free_call_crypto,
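rxkad now owns its packet padding: instead of exporting size_align/security_size for the sender to use, how_much_data sizes the buffer (rounding RXRPC_JUMBO_DATALEN down to the 8-byte RXKAD_ALIGN boundary) and the secure_packet paths zero-pad the header-plus-payload up to the next boundary themselves. The padding rule used in rxkad_secure_packet_auth()/..._encrypt(), restated as a stand-alone helper for illustration:

	static size_t example_rxkad_pad(size_t hdr_size, size_t data_size)
	{
		size_t pad = RXKAD_ALIGN - (hdr_size + data_size);

		return pad & (RXKAD_ALIGN - 1);	/* 0..7 bytes of zero padding */
	}
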
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index 9b1fb9ed0717..50cb5f1ee0c0 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -55,7 +55,7 @@ void rxrpc_exit_security(void)
/*
* look up an rxrpc security module
*/
-static const struct rxrpc_security *rxrpc_security_lookup(u8 security_index)
+const struct rxrpc_security *rxrpc_security_lookup(u8 security_index)
{
if (security_index >= ARRAY_SIZE(rxrpc_security_types))
return NULL;
@@ -81,16 +81,17 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
if (ret < 0)
return ret;
- token = key->payload.data[0];
- if (!token)
- return -EKEYREJECTED;
+ for (token = key->payload.data[0]; token; token = token->next) {
+ sec = rxrpc_security_lookup(token->security_index);
+ if (sec)
+ goto found;
+ }
+ return -EKEYREJECTED;
- sec = rxrpc_security_lookup(token->security_index);
- if (!sec)
- return -EKEYREJECTED;
+found:
conn->security = sec;
- ret = conn->security->init_connection_security(conn);
+ ret = conn->security->init_connection_security(conn, token);
if (ret < 0) {
conn->security = &rxrpc_no_security;
return ret;
@@ -101,22 +102,16 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
}
/*
- * Find the security key for a server connection.
+ * Set the ops for a server connection.
*/
-bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock *rx,
- const struct rxrpc_security **_sec,
- struct key **_key,
- struct sk_buff *skb)
+const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *rx,
+ struct sk_buff *skb)
{
const struct rxrpc_security *sec;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- key_ref_t kref = NULL;
- char kdesc[5 + 1 + 3 + 1];
_enter("");
- sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex);
-
sec = rxrpc_security_lookup(sp->hdr.securityIndex);
if (!sec) {
trace_rxrpc_abort(0, "SVS",
@@ -124,35 +119,72 @@ bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock
RX_INVALID_OPERATION, EKEYREJECTED);
skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
skb->priority = RX_INVALID_OPERATION;
- return false;
+ return NULL;
}
- if (sp->hdr.securityIndex == RXRPC_SECURITY_NONE)
- goto out;
-
- if (!rx->securities) {
+ if (sp->hdr.securityIndex != RXRPC_SECURITY_NONE &&
+ !rx->securities) {
trace_rxrpc_abort(0, "SVR",
sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
RX_INVALID_OPERATION, EKEYREJECTED);
skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
- skb->priority = RX_INVALID_OPERATION;
- return false;
+ skb->priority = sec->no_key_abort;
+ return NULL;
}
+ return sec;
+}
+
+/*
+ * Find the security key for a server connection.
+ */
+struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn,
+ struct sk_buff *skb,
+ u32 kvno, u32 enctype)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_sock *rx;
+ struct key *key = ERR_PTR(-EKEYREJECTED);
+ key_ref_t kref = NULL;
+ char kdesc[5 + 1 + 3 + 1 + 12 + 1 + 12 + 1];
+ int ret;
+
+ _enter("");
+
+ if (enctype)
+ sprintf(kdesc, "%u:%u:%u:%u",
+ sp->hdr.serviceId, sp->hdr.securityIndex, kvno, enctype);
+ else if (kvno)
+ sprintf(kdesc, "%u:%u:%u",
+ sp->hdr.serviceId, sp->hdr.securityIndex, kvno);
+ else
+ sprintf(kdesc, "%u:%u",
+ sp->hdr.serviceId, sp->hdr.securityIndex);
+
+ rcu_read_lock();
+
+ rx = rcu_dereference(conn->params.local->service);
+ if (!rx)
+ goto out;
+
/* look through the service's keyring */
kref = keyring_search(make_key_ref(rx->securities, 1UL),
&key_type_rxrpc_s, kdesc, true);
if (IS_ERR(kref)) {
- trace_rxrpc_abort(0, "SVK",
- sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
- sec->no_key_abort, EKEYREJECTED);
- skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
- skb->priority = sec->no_key_abort;
- return false;
+ key = ERR_CAST(kref);
+ goto out;
+ }
+
+ key = key_ref_to_ptr(kref);
+
+ ret = key_validate(key);
+ if (ret < 0) {
+ key_put(key);
+ key = ERR_PTR(ret);
+ goto out;
}
out:
- *_sec = sec;
- *_key = key_ref_to_ptr(kref);
- return true;
+ rcu_read_unlock();
+ return key;
}
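Server keys are no longer pinned to the connection; instead rxkad_verify_response() looks one up on demand via rxrpc_look_up_server_security(), which builds a keyring description from the packet header plus an optional key version and encoding type. For a packet with serviceId 52 and securityIndex 2 (numbers illustrative), the descriptions generated above are:

	52:2		neither kvno nor enctype supplied
	52:2:5		kvno 5
	52:2:5:17	kvno 5 and enctype 17
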
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index d27140c836cc..af8ad6c30b9f 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -327,7 +327,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
rxrpc_send_ack_packet(call, false, NULL);
if (!skb) {
- size_t size, chunk, max, space;
+ size_t remain, bufsize, chunk, offset;
_debug("alloc");
@@ -342,24 +342,21 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
goto maybe_error;
}
- max = RXRPC_JUMBO_DATALEN;
- max -= call->conn->security_size;
- max &= ~(call->conn->size_align - 1UL);
-
- chunk = max;
- if (chunk > msg_data_left(msg) && !more)
- chunk = msg_data_left(msg);
-
- space = chunk + call->conn->size_align;
- space &= ~(call->conn->size_align - 1UL);
-
- size = space + call->conn->security_size;
+ /* Work out the maximum size of a packet. Assume that
+ * the security header is going to be in the padded
+ * region (enc blocksize), but the trailer is not.
+ */
+ remain = more ? INT_MAX : msg_data_left(msg);
+ ret = call->conn->security->how_much_data(call, remain,
+ &bufsize, &chunk, &offset);
+ if (ret < 0)
+ goto maybe_error;
- _debug("SIZE: %zu/%zu/%zu", chunk, space, size);
+ _debug("SIZE: %zu/%zu @%zu", chunk, bufsize, offset);
/* create a buffer that we can retain until it's ACK'd */
skb = sock_alloc_send_skb(
- sk, size, msg->msg_flags & MSG_DONTWAIT, &ret);
+ sk, bufsize, msg->msg_flags & MSG_DONTWAIT, &ret);
if (!skb)
goto maybe_error;
@@ -371,9 +368,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
ASSERTCMP(skb->mark, ==, 0);
- _debug("HS: %u", call->conn->security_size);
- skb_reserve(skb, call->conn->security_size);
- skb->len += call->conn->security_size;
+ __skb_put(skb, offset);
sp->remain = chunk;
if (sp->remain > skb_tailroom(skb))
@@ -422,17 +417,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
(msg_data_left(msg) == 0 && !more)) {
struct rxrpc_connection *conn = call->conn;
uint32_t seq;
- size_t pad;
-
- /* pad out if we're using security */
- if (conn->security_ix) {
- pad = conn->security_size + skb->mark;
- pad = conn->size_align - pad;
- pad &= conn->size_align - 1;
- _debug("pad %zu", pad);
- if (pad)
- skb_put_zero(skb, pad);
- }
seq = call->tx_top + 1;
@@ -446,8 +430,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
call->tx_winsize)
sp->hdr.flags |= RXRPC_MORE_PACKETS;
- ret = call->security->secure_packet(
- call, skb, skb->mark, skb->head);
+ ret = call->security->secure_packet(call, skb, skb->mark);
if (ret < 0)
goto out;
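On the send side, all sizing and padding decisions move into the security module: how_much_data() reports the buffer size, payload limit and header offset, the skb simply reserves the header region with __skb_put(), and secure_packet() no longer needs a separate sechdr pointer. A condensed, illustrative sketch of the new allocation step (error paths and skb bookkeeping trimmed; names mirror the hunks above):

	static int example_alloc_data_skb(struct rxrpc_call *call, struct msghdr *msg,
					  struct sock *sk, bool more, struct sk_buff **_skb)
	{
		size_t remain, bufsize, chunk, offset;
		struct sk_buff *skb;
		int ret;

		remain = more ? INT_MAX : msg_data_left(msg);
		ret = call->conn->security->how_much_data(call, remain,
							  &bufsize, &chunk, &offset);
		if (ret < 0)
			return ret;

		skb = sock_alloc_send_skb(sk, bufsize, msg->msg_flags & MSG_DONTWAIT, &ret);
		if (!skb)
			return ret;

		__skb_put(skb, offset);		/* room for the security header */
		*_skb = skb;
		return 0;			/* caller copies at most chunk bytes of payload */
	}
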
diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c
new file mode 100644
index 000000000000..ead3471307ee
--- /dev/null
+++ b/net/rxrpc/server_key.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* RxRPC key management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * RxRPC keys should have a description describing their purpose:
+ * "afs@CAMBRIDGE.REDHAT.COM"
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/skcipher.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/key-type.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
+#include <keys/user-type.h>
+#include "ar-internal.h"
+
+static int rxrpc_vet_description_s(const char *);
+static int rxrpc_preparse_s(struct key_preparsed_payload *);
+static void rxrpc_free_preparse_s(struct key_preparsed_payload *);
+static void rxrpc_destroy_s(struct key *);
+static void rxrpc_describe_s(const struct key *, struct seq_file *);
+
+/*
+ * rxrpc server keys take "<serviceId>:<securityIndex>[:<sec-specific>]" as the
+ * description and the key material as the payload.
+ */
+struct key_type key_type_rxrpc_s = {
+ .name = "rxrpc_s",
+ .flags = KEY_TYPE_NET_DOMAIN,
+ .vet_description = rxrpc_vet_description_s,
+ .preparse = rxrpc_preparse_s,
+ .free_preparse = rxrpc_free_preparse_s,
+ .instantiate = generic_key_instantiate,
+ .destroy = rxrpc_destroy_s,
+ .describe = rxrpc_describe_s,
+};
+
+/*
+ * Vet the description for an RxRPC server key.
+ */
+static int rxrpc_vet_description_s(const char *desc)
+{
+ unsigned long service, sec_class;
+ char *p;
+
+ service = simple_strtoul(desc, &p, 10);
+ if (*p != ':' || service > 65535)
+ return -EINVAL;
+ sec_class = simple_strtoul(p + 1, &p, 10);
+ if ((*p && *p != ':') || sec_class < 1 || sec_class > 255)
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Preparse a server secret key.
+ */
+static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
+{
+ const struct rxrpc_security *sec;
+ unsigned int service, sec_class;
+ int n;
+
+ _enter("%zu", prep->datalen);
+
+ if (!prep->orig_description)
+ return -EINVAL;
+
+ if (sscanf(prep->orig_description, "%u:%u%n", &service, &sec_class, &n) != 2)
+ return -EINVAL;
+
+ sec = rxrpc_security_lookup(sec_class);
+ if (!sec)
+ return -ENOPKG;
+
+ prep->payload.data[1] = (struct rxrpc_security *)sec;
+
+ return sec->preparse_server_key(prep);
+}
+
+static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep)
+{
+ const struct rxrpc_security *sec = prep->payload.data[1];
+
+ if (sec)
+ sec->free_preparse_server_key(prep);
+}
+
+static void rxrpc_destroy_s(struct key *key)
+{
+ const struct rxrpc_security *sec = key->payload.data[1];
+
+ if (sec)
+ sec->destroy_server_key(key);
+}
+
+static void rxrpc_describe_s(const struct key *key, struct seq_file *m)
+{
+ const struct rxrpc_security *sec = key->payload.data[1];
+
+ seq_puts(m, key->description);
+ if (sec && sec->describe_server_key)
+ sec->describe_server_key(key, m);
+}
+
+/*
+ * grab the security keyring for a server socket
+ */
+int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen)
+{
+ struct key *key;
+ char *description;
+
+ _enter("");
+
+ if (optlen <= 0 || optlen > PAGE_SIZE - 1)
+ return -EINVAL;
+
+ description = memdup_sockptr_nul(optval, optlen);
+ if (IS_ERR(description))
+ return PTR_ERR(description);
+
+ key = request_key(&key_type_keyring, description, NULL);
+ if (IS_ERR(key)) {
+ kfree(description);
+ _leave(" = %ld", PTR_ERR(key));
+ return PTR_ERR(key);
+ }
+
+ rx->securities = key;
+ kfree(description);
+ _leave(" = 0 [key %x]", key->serial);
+ return 0;
+}
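The server key type moves into its own file and its description gains an optional security-specific suffix; preparsing now dispatches to the security module named by the security index, so rxkad keys still require an 8-byte secret. An illustrative userspace sketch of installing such a key with the keyutils library (service ID 52, security index 2 and the keyring choice are placeholders; the real key material is site-specific):

	#include <keyutils.h>

	static key_serial_t example_install_server_key(void)
	{
		static const unsigned char secret[8] = { 0 };	/* site-specific 8-byte secret */

		return add_key("rxrpc_s", "52:2", secret, sizeof(secret),
			       KEY_SPEC_PROCESS_KEYRING);
	}
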
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index e91acc95ff28..540351d6a5f4 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -74,21 +74,13 @@ static struct ctl_table rxrpc_sysctl_table[] = {
/* Non-time values */
{
- .procname = "max_client_conns",
- .data = &rxrpc_max_client_connections,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)&rxrpc_reap_client_connections,
- },
- {
.procname = "reap_client_conns",
.data = &rxrpc_reap_client_connections,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)SYSCTL_ONE,
- .extra2 = (void *)&rxrpc_max_client_connections,
+ .extra2 = (void *)&n_65535,
},
{
.procname = "max_backlog",
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a3b37d88800e..1e8ab4749c6c 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -281,7 +281,7 @@ config NET_SCH_CHOKE
help
Say Y here if you want to use the CHOKe packet scheduler (CHOose
and Keep for responsive flows, CHOose and Kill for unresponsive
- flows). This is a variation of RED which trys to penalize flows
+ flows). This is a variation of RED which tries to penalize flows
that monopolize the queue.
To compile this code as a module, choose M here: the
@@ -813,7 +813,7 @@ config NET_ACT_SAMPLE
config NET_ACT_IPT
tristate "IPtables targets"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NET_CLS_ACT && NETFILTER && NETFILTER_XTABLES
help
Say Y here to be able to invoke iptables targets after successful
classification.
@@ -912,7 +912,7 @@ config NET_ACT_BPF
config NET_ACT_CONNMARK
tristate "Netfilter Connection Mark Retriever"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NET_CLS_ACT && NETFILTER
depends on NF_CONNTRACK && NF_CONNTRACK_MARK
help
Say Y here to allow retrieving of conn mark
@@ -924,7 +924,7 @@ config NET_ACT_CONNMARK
config NET_ACT_CTINFO
tristate "Netfilter Connection Mark Actions"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NET_CLS_ACT && NETFILTER
depends on NF_CONNTRACK && NF_CONNTRACK_MARK
help
Say Y here to allow transfer of a connmark stored information.
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 66bbf9a98f9e..dd14ef413fda 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -5,6 +5,7 @@
obj-y := sch_generic.o sch_mq.o
+obj-$(CONFIG_INET) += sch_frag.o
obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o
obj-$(CONFIG_NET_CLS) += cls_api.o
obj-$(CONFIG_NET_CLS_ACT) += act_api.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 798430e1a79f..2e85b636b27b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -22,6 +22,22 @@
#include <net/act_api.h>
#include <net/netlink.h>
+#ifdef CONFIG_INET
+DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count);
+EXPORT_SYMBOL_GPL(tcf_frag_xmit_count);
+#endif
+
+int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
+{
+#ifdef CONFIG_INET
+ if (static_branch_unlikely(&tcf_frag_xmit_count))
+ return sch_frag_xmit_hook(skb, xmit);
+#endif
+
+ return xmit(skb);
+}
+EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit);
+
static void tcf_action_goto_chain_exec(const struct tc_action *a,
struct tcf_result *res)
{
@@ -215,6 +231,36 @@ static size_t tcf_action_fill_size(const struct tc_action *act)
return sz;
}
+static int
+tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cookie *cookie;
+
+ if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+ goto nla_put_failure;
+ if (tcf_action_copy_stats(skb, a, 0))
+ goto nla_put_failure;
+ if (from_act && nla_put_u32(skb, TCA_ACT_INDEX, a->tcfa_index))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ cookie = rcu_dereference(a->act_cookie);
+ if (cookie) {
+ if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+ }
+ rcu_read_unlock();
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct netlink_callback *cb)
{
@@ -248,7 +294,9 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
index--;
goto nla_put_failure;
}
- err = tcf_action_dump_1(skb, p, 0, 0);
+ err = (act_flags & TCA_ACT_FLAG_TERSE_DUMP) ?
+ tcf_action_dump_terse(skb, p, true) :
+ tcf_action_dump_1(skb, p, 0, 0);
if (err < 0) {
index--;
nlmsg_trim(skb, nest);
@@ -256,7 +304,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
}
nla_nest_end(skb, nest);
n_i++;
- if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) &&
+ if (!(act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) &&
n_i >= TCA_ACT_MAX_PRIO)
goto done;
}
@@ -266,7 +314,7 @@ done:
mutex_unlock(&idrinfo->lock);
if (n_i) {
- if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
+ if (act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON)
cb->args[1] = n_i;
}
return n_i;
@@ -651,7 +699,7 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
return res;
}
-/*TCA_ACT_MAX_PRIO is 32, there count upto 32 */
+/*TCA_ACT_MAX_PRIO is 32, there count up to 32 */
#define TCA_ACT_MAX_PRIO_MASK 0x1FF
int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
int nr_actions, struct tcf_result *res)
@@ -752,34 +800,6 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
return a->ops->dump(skb, a, bind, ref);
}
-static int
-tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tc_cookie *cookie;
-
- if (nla_put_string(skb, TCA_KIND, a->ops->kind))
- goto nla_put_failure;
- if (tcf_action_copy_stats(skb, a, 0))
- goto nla_put_failure;
-
- rcu_read_lock();
- cookie = rcu_dereference(a->act_cookie);
- if (cookie) {
- if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
- rcu_read_unlock();
- goto nla_put_failure;
- }
- }
- rcu_read_unlock();
-
- return 0;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
int
tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
@@ -787,7 +807,7 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
- if (tcf_action_dump_terse(skb, a))
+ if (tcf_action_dump_terse(skb, a, false))
goto nla_put_failure;
if (a->hw_stats != TCA_ACT_HW_STATS_ANY &&
@@ -832,7 +852,7 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
nest = nla_nest_start_noflag(skb, i + 1);
if (nest == NULL)
goto nla_put_failure;
- err = terse ? tcf_action_dump_terse(skb, a) :
+ err = terse ? tcf_action_dump_terse(skb, a, false) :
tcf_action_dump_1(skb, a, bind, ref);
if (err < 0)
goto errout;
@@ -935,7 +955,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
NL_SET_ERR_MSG(extack, "TC action kind must be specified");
goto err_out;
}
- if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) {
+ if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) {
NL_SET_ERR_MSG(extack, "TC action name too long");
goto err_out;
}
@@ -982,7 +1002,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
#endif
NL_SET_ERR_MSG(extack, "Failed to load TC action module");
err = -ENOENT;
- goto err_out;
+ goto err_free;
}
/* backward compatibility for policer */
@@ -1012,11 +1032,12 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
err_mod:
module_put(a_o->owner);
-err_out:
+err_free:
if (cookie) {
kfree(cookie->data);
kfree(cookie);
}
+err_out:
return ERR_PTR(err);
}
@@ -1468,7 +1489,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
}
static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
- [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_FLAG_LARGE_DUMP_ON),
+ [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAG_LARGE_DUMP_ON |
+ TCA_ACT_FLAG_TERSE_DUMP),
[TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 },
};
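
Note: tcf_dev_queue_xmit() above only detours through sch_frag_xmit_hook() while something has incremented the tcf_frag_xmit_count static key (act_ct does so on module init further down); with no users the wrapper collapses to a plain xmit. A rough userspace analogue of that gating pattern, with an ordinary counter standing in for the jump-label static key and made-up function names:

    #include <stdio.h>

    /* Stand-in for DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count). */
    static int frag_hook_users;

    static int do_xmit(const char *pkt)
    {
        printf("xmit %s\n", pkt);
        return 0;
    }

    static int frag_hook(const char *pkt, int (*xmit)(const char *))
    {
        printf("fragmenting before xmit\n");
        return xmit(pkt);
    }

    /* Analogue of tcf_dev_queue_xmit(): take the slow path only when enabled. */
    static int queue_xmit(const char *pkt, int (*xmit)(const char *))
    {
        if (frag_hook_users)            /* static_branch_unlikely() in the kernel */
            return frag_hook(pkt, xmit);
        return xmit(pkt);
    }

    int main(void)
    {
        queue_xmit("p1", do_xmit);      /* direct */
        frag_hook_users++;              /* module load: static_branch_inc() */
        queue_xmit("p2", do_xmit);      /* via the hook */
        return 0;
    }
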
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index a4c7ba35a343..e48e980c3b93 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -65,7 +65,7 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
* In case a different well-known TC_ACT opcode has been
* returned, it will overwrite the default one.
*
- * For everything else that is unkown, TC_ACT_UNSPEC is
+ * For everything else that is unknown, TC_ACT_UNSPEC is
* returned.
*/
switch (filter_res) {
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index a780afdf570d..83a5c6722a06 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -156,11 +156,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
__be16 target_dst = target.dst.u.udp.port;
if (target_src != tuple->src.u.udp.port)
- tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
offsetof(struct udphdr, source),
0xFFFF, be16_to_cpu(target_src));
if (target_dst != tuple->dst.u.udp.port)
- tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
offsetof(struct udphdr, dest),
0xFFFF, be16_to_cpu(target_dst));
}
@@ -296,7 +296,8 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
goto err_insert;
ct_ft->nf_ft.type = &flowtable_ct;
- ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD;
+ ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD |
+ NF_FLOWTABLE_COUNTER;
err = nf_flow_table_init(&ct_ft->nf_ft);
if (err)
goto err_init;
@@ -540,7 +541,8 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
flow_offload_refresh(nf_ft, flow);
nf_conntrack_get(&ct->ct_general);
nf_ct_set(skb, ct, ctinfo);
- nf_ct_acct_update(ct, dir, skb->len);
+ if (nf_ft->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(ct, dir, skb->len);
return true;
}
@@ -1039,7 +1041,7 @@ drop:
static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
[TCA_CT_ACTION] = { .type = NLA_U16 },
- [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
+ [TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)),
[TCA_CT_ZONE] = { .type = NLA_U16 },
[TCA_CT_MARK] = { .type = NLA_U32 },
[TCA_CT_MARK_MASK] = { .type = NLA_U32 },
@@ -1049,10 +1051,8 @@ static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
.len = 128 / BITS_PER_BYTE },
[TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
[TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
- [TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct in6_addr) },
- [TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct in6_addr) },
+ [TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
};
@@ -1543,6 +1543,8 @@ static int __init ct_init_module(void)
if (err)
goto err_register;
+ static_branch_inc(&tcf_frag_xmit_count);
+
return 0;
err_register:
@@ -1554,6 +1556,7 @@ err_tbl_init:
static void __exit ct_cleanup_module(void)
{
+ static_branch_dec(&tcf_frag_xmit_count);
tcf_unregister_action(&act_ct_ops, &ct_net_ops);
tcf_ct_flow_tables_uninit();
destroy_workqueue(act_ct_wq);
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 6084300e51ad..b20c8ce59905 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -144,9 +144,8 @@ out:
}
static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
- [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct
- tc_ctinfo) },
+ [TCA_CTINFO_ACT] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct tc_ctinfo)),
[TCA_CTINFO_ZONE] = { .type = NLA_U16 },
[TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
[TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
index 7c0771dd77a3..a78cb7965718 100644
--- a/net/sched/act_gate.c
+++ b/net/sched/act_gate.c
@@ -159,8 +159,8 @@ static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = {
};
static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = {
- [TCA_GATE_PARMS] = { .len = sizeof(struct tc_gate),
- .type = NLA_EXACT_LEN },
+ [TCA_GATE_PARMS] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)),
[TCA_GATE_PRIORITY] = { .type = NLA_S32 },
[TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED },
[TCA_GATE_BASE_TIME] = { .type = NLA_U64 },
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 8dc3bec0d325..ac7297f42355 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -166,7 +166,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
if (unlikely(!tname))
goto err1;
if (tb[TCA_IPT_TABLE] == NULL ||
- nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ)
+ nla_strscpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ)
strcpy(tname, "mangle");
t = kmemdup(td, td->u.target_size, GFP_KERNEL);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index e24b7e2331cd..7153c67f641e 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -205,6 +205,18 @@ release_idr:
return err;
}
+static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
+{
+ int err;
+
+ if (!want_ingress)
+ err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
+ else
+ err = netif_receive_skb(skb);
+
+ return err;
+}
+
static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
@@ -287,18 +299,15 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
/* let's the caller reinsert the packet, if possible */
if (use_reinsert) {
res->ingress = want_ingress;
- if (skb_tc_reinsert(skb, res))
+ err = tcf_mirred_forward(res->ingress, skb);
+ if (err)
tcf_action_inc_overlimit_qstats(&m->common);
__this_cpu_dec(mirred_rec_level);
return TC_ACT_CONSUMED;
}
}
- if (!want_ingress)
- err = dev_queue_xmit(skb2);
- else
- err = netif_receive_skb(skb2);
-
+ err = tcf_mirred_forward(want_ingress, skb2);
if (err) {
out:
tcf_action_inc_overlimit_qstats(&m->common);
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index e298ec3b3c9e..d1486ea496a2 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -87,7 +87,27 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
skb->dev && skb->dev->type == ARPHRD_ETHER))
goto drop;
break;
+ case TCA_MPLS_ACT_MAC_PUSH:
+ if (skb_vlan_tag_present(skb)) {
+ if (__vlan_insert_inner_tag(skb, skb->vlan_proto,
+ skb_vlan_tag_get(skb),
+ ETH_HLEN) < 0)
+ goto drop;
+
+ skb->protocol = skb->vlan_proto;
+ __vlan_hwaccel_clear_tag(skb);
+ }
+
+ new_lse = tcf_mpls_get_lse(NULL, p, mac_len ||
+ !eth_p_mpls(skb->protocol));
+
+ if (skb_mpls_push(skb, new_lse, p->tcfm_proto, 0, false))
+ goto drop;
+ break;
case TCA_MPLS_ACT_MODIFY:
+ if (!pskb_may_pull(skb,
+ skb_network_offset(skb) + MPLS_HLEN))
+ goto drop;
new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false);
if (skb_mpls_update_lse(skb, new_lse))
goto drop;
@@ -188,6 +208,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
}
break;
case TCA_MPLS_ACT_PUSH:
+ case TCA_MPLS_ACT_MAC_PUSH:
if (!tb[TCA_MPLS_LABEL]) {
NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push");
return -EINVAL;
@@ -408,6 +429,7 @@ static void __exit mpls_cleanup_module(void)
module_init(mpls_init_module);
module_exit(mpls_cleanup_module);
+MODULE_SOFTDEP("post: mpls_gso");
MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MPLS manipulation actions");
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index a4f3d0f0daa9..726cc956d06f 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -52,7 +52,7 @@ static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata)
d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
if (unlikely(!d->tcfd_defdata))
return -ENOMEM;
- nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+ nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
return 0;
}
@@ -71,7 +71,7 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata,
spin_lock_bh(&d->tcf_lock);
goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch);
memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
- nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+ nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
spin_unlock_bh(&d->tcf_lock);
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index a229751ee8c4..85c0d0d5b9da 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -459,7 +459,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port,
0, flags,
- key_id, 0);
+ key_id, opts_len);
} else {
NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst");
ret = -EINVAL;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 163b0385fd4c..1cac3c6fbb49 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -77,6 +77,16 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
/* put updated tci as hwaccel tag */
__vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci);
break;
+ case TCA_VLAN_ACT_POP_ETH:
+ err = skb_eth_pop(skb);
+ if (err)
+ goto drop;
+ break;
+ case TCA_VLAN_ACT_PUSH_ETH:
+ err = skb_eth_push(skb, p->tcfv_push_dst, p->tcfv_push_src);
+ if (err)
+ goto drop;
+ break;
default:
BUG();
}
@@ -93,10 +103,13 @@ drop:
}
static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
+ [TCA_VLAN_UNSPEC] = { .strict_start_type = TCA_VLAN_PUSH_ETH_DST },
[TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) },
[TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 },
[TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 },
[TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 },
+ [TCA_VLAN_PUSH_ETH_DST] = NLA_POLICY_ETH_ADDR,
+ [TCA_VLAN_PUSH_ETH_SRC] = NLA_POLICY_ETH_ADDR,
};
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
@@ -179,6 +192,17 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY])
push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]);
break;
+ case TCA_VLAN_ACT_POP_ETH:
+ break;
+ case TCA_VLAN_ACT_PUSH_ETH:
+ if (!tb[TCA_VLAN_PUSH_ETH_DST] || !tb[TCA_VLAN_PUSH_ETH_SRC]) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+ break;
default:
if (exists)
tcf_idr_release(*a, bind);
@@ -219,6 +243,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
p->tcfv_push_prio = push_prio;
p->tcfv_push_proto = push_proto;
+ if (action == TCA_VLAN_ACT_PUSH_ETH) {
+ nla_memcpy(&p->tcfv_push_dst, tb[TCA_VLAN_PUSH_ETH_DST],
+ ETH_ALEN);
+ nla_memcpy(&p->tcfv_push_src, tb[TCA_VLAN_PUSH_ETH_SRC],
+ ETH_ALEN);
+ }
+
spin_lock_bh(&v->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
@@ -277,6 +308,15 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
p->tcfv_push_prio))))
goto nla_put_failure;
+ if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) {
+ if (nla_put(skb, TCA_VLAN_PUSH_ETH_DST, ETH_ALEN,
+ p->tcfv_push_dst))
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_VLAN_PUSH_ETH_SRC, ETH_ALEN,
+ p->tcfv_push_src))
+ goto nla_put_failure;
+ }
+
tcf_tm_dump(&t, &v->tcf_tm);
if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
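
Note: the new TCA_VLAN_ACT_PUSH_ETH / POP_ETH pair adds or strips an outer Ethernet header using the configured TCA_VLAN_PUSH_ETH_DST/SRC addresses. As a rough idea of the data-path effect of the push side only (userspace sketch; the in-kernel skb_eth_push() also updates skb offsets, mac_len and checksum state, and POP_ETH simply removes the same 14 bytes):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define ETH_ALEN 6
    #define ETH_HLEN 14

    /* Prepend dst/src MACs and an EtherType in front of the existing payload. */
    static size_t eth_push(uint8_t *buf, size_t len, size_t cap,
                           const uint8_t dst[ETH_ALEN], const uint8_t src[ETH_ALEN],
                           uint16_t proto_be)
    {
        if (len + ETH_HLEN > cap)
            return 0;
        memmove(buf + ETH_HLEN, buf, len);
        memcpy(buf, dst, ETH_ALEN);
        memcpy(buf + ETH_ALEN, src, ETH_ALEN);
        memcpy(buf + 2 * ETH_ALEN, &proto_be, sizeof(proto_be));
        return len + ETH_HLEN;
    }

    int main(void)
    {
        uint8_t pkt[64] = { 0x45 };     /* pretend IPv4 payload */
        const uint8_t dst[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 1 };
        const uint8_t src[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 2 };

        printf("new length: %zu\n",
               eth_push(pkt, 20, sizeof(pkt), dst, src, htons(0x0800)));
        return 0;
    }
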
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 41a55c6cbeb8..37b77bd30974 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -223,7 +223,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
static bool tcf_proto_check_kind(struct nlattr *kind, char *name)
{
if (kind)
- return nla_strlcpy(name, kind, IFNAMSIZ) >= IFNAMSIZ;
+ return nla_strscpy(name, kind, IFNAMSIZ) < 0;
memset(name, 0, IFNAMSIZ);
return false;
}
@@ -652,12 +652,12 @@ static void tc_block_indr_cleanup(struct flow_block_cb *block_cb)
block_cb->indr.binder_type,
&block->flow_block, tcf_block_shared(block),
&extack);
+ rtnl_lock();
down_write(&block->cb_lock);
list_del(&block_cb->driver_list);
list_move(&block_cb->list, &bo.cb_list);
- up_write(&block->cb_lock);
- rtnl_lock();
tcf_block_unbind(block, &bo);
+ up_write(&block->cb_lock);
rtnl_unlock();
}
@@ -991,13 +991,12 @@ __tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp)
*/
struct tcf_proto *
-tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp,
- bool rtnl_held)
+tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp)
{
struct tcf_proto *tp_next = __tcf_get_next_proto(chain, tp);
if (tp)
- tcf_proto_put(tp, rtnl_held, NULL);
+ tcf_proto_put(tp, true, NULL);
return tp_next;
}
@@ -1924,15 +1923,14 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
struct tcf_block *block, struct Qdisc *q,
u32 parent, struct nlmsghdr *n,
- struct tcf_chain *chain, int event,
- bool rtnl_held)
+ struct tcf_chain *chain, int event)
{
struct tcf_proto *tp;
- for (tp = tcf_get_next_proto(chain, NULL, rtnl_held);
- tp; tp = tcf_get_next_proto(chain, tp, rtnl_held))
+ for (tp = tcf_get_next_proto(chain, NULL);
+ tp; tp = tcf_get_next_proto(chain, tp))
tfilter_notify(net, oskb, n, tp, block,
- q, parent, NULL, event, false, rtnl_held);
+ q, parent, NULL, event, false, true);
}
static void tfilter_put(struct tcf_proto *tp, void *fh)
@@ -2262,7 +2260,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
if (prio == 0) {
tfilter_notify_chain(net, skb, block, q, parent, n,
- chain, RTM_DELTFILTER, rtnl_held);
+ chain, RTM_DELTFILTER);
tcf_chain_flush(chain, rtnl_held);
err = 0;
goto errout;
@@ -2895,7 +2893,7 @@ replay:
break;
case RTM_DELCHAIN:
tfilter_notify_chain(net, skb, block, q, parent, n,
- chain, RTM_DELTFILTER, true);
+ chain, RTM_DELTFILTER);
/* Flush the chain first as the user requested chain removal. */
tcf_chain_flush(chain, true);
/* In case the chain was successfully deleted, put a reference
@@ -2940,7 +2938,6 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
struct tcf_chain *chain;
long index_start;
long index;
- u32 parent;
int err;
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
@@ -2955,13 +2952,6 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
if (!block)
goto out;
- /* If we work with block index, q is NULL and parent value
- * will never be used in the following code. The check
- * in tcf_fill_node prevents it. However, compiler does not
- * see that far, so set parent to zero to silence the warning
- * about parent being uninitialized.
- */
- parent = 0;
} else {
const struct Qdisc_class_ops *cops;
struct net_device *dev;
@@ -2971,13 +2961,11 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
if (!dev)
return skb->len;
- parent = tcm->tcm_parent;
- if (!parent) {
+ if (!tcm->tcm_parent)
q = dev->qdisc;
- parent = q->handle;
- } else {
+ else
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
- }
+
if (!q)
goto out;
cops = q->ops->cl_ops;
@@ -3712,7 +3700,7 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->gate.num_entries = tcf_gate_num_entries(act);
err = tcf_gate_get_entries(entry, act);
if (err)
- goto err_out;
+ goto err_out_locked;
} else {
err = -EOPNOTSUPP;
goto err_out_locked;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index fed18fd2c50b..84f932532db7 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1272,6 +1272,10 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
+ if (!nla_ok(nla_opt_msk, msk_depth)) {
+ NL_SET_ERR_MSG(extack, "Invalid nested attribute for masks");
+ return -EINVAL;
+ }
}
nla_for_each_attr(nla_opt_key, nla_enc_key,
@@ -1307,9 +1311,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
return -EINVAL;
}
-
- if (msk_depth)
- nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
break;
case TCA_FLOWER_KEY_ENC_OPTS_VXLAN:
if (key->enc_opts.dst_opt_type) {
@@ -1340,9 +1341,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
return -EINVAL;
}
-
- if (msk_depth)
- nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
break;
case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN:
if (key->enc_opts.dst_opt_type) {
@@ -1373,14 +1371,20 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
return -EINVAL;
}
-
- if (msk_depth)
- nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
break;
default:
NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
return -EINVAL;
}
+
+ if (!msk_depth)
+ continue;
+
+ if (!nla_ok(nla_opt_msk, msk_depth)) {
+ NL_SET_ERR_MSG(extack, "A mask attribute is invalid");
+ return -EINVAL;
+ }
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
}
return 0;
@@ -2424,8 +2428,8 @@ static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb,
return err;
}
if (lse_mask->mpls_label) {
- err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL,
- lse_key->mpls_label);
+ err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL,
+ lse_key->mpls_label);
if (err)
return err;
}
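
Note: the flower change tightens parsing of TCA_FLOWER_KEY_ENC_OPTS_MASK: the mask stream is checked with nla_ok() before the first and every subsequent attribute is dereferenced, and the single nla_next() advance now happens once per option, after the switch. A toy, host-endian TLV walker showing the same bounds discipline (illustrative types and names, not the netlink API itself):

    #include <stdint.h>
    #include <stdio.h>

    struct tlv {
        uint16_t len;   /* header + payload, like nla_len */
        uint16_t type;
    };

    /* Never read a header unless the remaining length can hold it, and never
     * trust an advertised length that runs past the end of the buffer.
     */
    static int tlv_ok(const struct tlv *t, int remaining)
    {
        return remaining >= (int)sizeof(*t) &&
               t->len >= sizeof(*t) && t->len <= (unsigned int)remaining;
    }

    static const struct tlv *tlv_next(const struct tlv *t, int *remaining)
    {
        int adv = (t->len + 3) & ~3;    /* 4-byte alignment, like NLA_ALIGN() */

        *remaining -= adv;
        return (const struct tlv *)((const uint8_t *)t + adv);
    }

    int main(void)
    {
        /* two complete TLVs followed by a truncated one, which the walk rejects */
        _Alignas(4) uint8_t buf[20] = { 8, 0, 1, 0, 0, 0, 0, 0,
                                        8, 0, 2, 0, 0, 0, 0, 0,
                                        8, 0, 3, 0 };
        const struct tlv *t = (const struct tlv *)buf;
        int remaining = sizeof(buf);

        for (; tlv_ok(t, remaining); t = tlv_next(t, &remaining))
            printf("type %u, payload %u bytes\n",
                   (unsigned int)t->type,
                   (unsigned int)(t->len - sizeof(*t)));
        return 0;
    }
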
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index d36949d9382c..2e288f88ff02 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -238,7 +238,7 @@ static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h)
}
}
- /* Something went wrong if we are trying to replace a non-existant
+ /* Something went wrong if we are trying to replace a non-existent
* node. Mind as well halt instead of silently failing.
*/
BUG_ON(1);
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 78bec347b8b6..c4007b9cd16d 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -366,9 +366,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
if (tb[TCA_TCINDEX_MASK])
cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
- if (tb[TCA_TCINDEX_SHIFT])
+ if (tb[TCA_TCINDEX_SHIFT]) {
cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
-
+ if (cp->shift > 16) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
if (!cp->hash) {
/* Hash not specified, use perfect hash if the upper limit
* of the hashing index is below the threshold.
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 7b69ab1993ba..6e1abe805448 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -79,7 +79,7 @@ struct tc_u_hnode {
/* The 'ht' field MUST be the last field in structure to allow for
* more entries allocated at end of structure.
*/
- struct tc_u_knode __rcu *ht[1];
+ struct tc_u_knode __rcu *ht[];
};
struct tc_u_common {
@@ -353,7 +353,7 @@ static int u32_init(struct tcf_proto *tp)
void *key = tc_u_common_ptr(tp);
struct tc_u_common *tp_c = tc_u_common_find(key);
- root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
+ root_ht = kzalloc(struct_size(root_ht, ht, 1), GFP_KERNEL);
if (root_ht == NULL)
return -ENOBUFS;
@@ -364,7 +364,7 @@ static int u32_init(struct tcf_proto *tp)
idr_init(&root_ht->handle_idr);
if (tp_c == NULL) {
- tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
+ tp_c = kzalloc(struct_size(tp_c, hlist->ht, 1), GFP_KERNEL);
if (tp_c == NULL) {
kfree(root_ht);
return -ENOBUFS;
@@ -933,7 +933,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table");
return -EINVAL;
}
- ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
+ ht = kzalloc(struct_size(ht, ht, divisor + 1), GFP_KERNEL);
if (ht == NULL)
return -ENOBUFS;
if (handle == 0) {
@@ -1171,7 +1171,6 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
struct tcf_block *block = tp->chain->block;
struct tc_cls_u32_offload cls_u32 = {};
- int err;
tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack);
cls_u32.command = add ?
@@ -1194,13 +1193,9 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.knode.link_handle = ht->handle;
}
- err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32,
- &cls_u32, cb_priv, &n->flags,
- &n->in_hw_count);
- if (err)
- return err;
-
- return 0;
+ return tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32,
+ &cls_u32, cb_priv, &n->flags,
+ &n->in_hw_count);
}
static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
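
Note: with tc_u_hnode's ht[1] turned into a C99 flexible array member, the allocations above switch to struct_size(), which sizes the header plus the trailing slots and saturates on overflow. The arithmetic it replaces looks roughly like this userspace sketch (plain calloc, no overflow saturation):

    #include <stdio.h>
    #include <stdlib.h>

    struct hnode {
        unsigned int divisor;
        void *ht[];             /* flexible array member, one slot per bucket */
    };

    /* Stand-in for kzalloc(struct_size(ht, ht, divisor + 1), GFP_KERNEL):
     * size the header plus (divisor + 1) trailing pointers in one allocation.
     */
    static struct hnode *hnode_alloc(unsigned int divisor)
    {
        return calloc(1, sizeof(struct hnode) + (divisor + 1) * sizeof(void *));
    }

    int main(void)
    {
        struct hnode *h = hnode_alloc(255);

        if (!h)
            return 1;
        printf("allocated %zu bytes\n",
               sizeof(struct hnode) + 256 * sizeof(void *));
        free(h);
        return 0;
    }
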
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index a4d09b1fb66a..f17b049ea530 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -41,7 +41,7 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
break;
case TCF_EM_ALIGN_U32:
- /* Worth checking boundries? The branching seems
+ /* Worth checking boundaries? The branching seems
* to get worse. Visit again.
*/
val = get_unaligned_be32(ptr);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 2a76a2f5ed88..6fe4e5cc807c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -412,7 +412,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
{
struct qdisc_rate_table *rtab;
- if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
+ if (tab == NULL || r->rate == 0 ||
+ r->cell_log == 0 || r->cell_log >= 32 ||
nla_len(tab) != TC_RTAB_SIZE) {
NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
return NULL;
@@ -1170,7 +1171,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
#ifdef CONFIG_MODULES
if (ops == NULL && kind != NULL) {
char name[IFNAMSIZ];
- if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
+ if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
/* We dropped the RTNL semaphore in order to
* perform the module load. So, even if we
* succeeded in loading the module we have to
@@ -1943,8 +1944,8 @@ static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
chain = tcf_get_next_chain(block, chain)) {
struct tcf_proto *tp;
- for (tp = tcf_get_next_proto(chain, NULL, true);
- tp; tp = tcf_get_next_proto(chain, tp, true)) {
+ for (tp = tcf_get_next_proto(chain, NULL);
+ tp; tp = tcf_get_next_proto(chain, tp)) {
struct tcf_bind_args arg = {};
arg.w.fn = tcf_node_bind;
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 1c281cc81f57..007bd2d9f1ff 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -466,10 +466,10 @@ drop: __maybe_unused
* non-ATM interfaces.
*/
-static void sch_atm_dequeue(unsigned long data)
+static void sch_atm_dequeue(struct tasklet_struct *t)
{
- struct Qdisc *sch = (struct Qdisc *)data;
- struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_qdisc_data *p = from_tasklet(p, t, task);
+ struct Qdisc *sch = qdisc_from_priv(p);
struct atm_flow_data *flow;
struct sk_buff *skb;
@@ -563,7 +563,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt,
if (err)
return err;
- tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch);
+ tasklet_setup(&p->task, sch_atm_dequeue);
return 0;
}
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index 2eaac2ff380f..459cc240eda9 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -50,6 +50,7 @@
* locredit = max_frame_size * (sendslope / port_transmit_rate)
*/
+#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index bd618b00d319..50f680f03a54 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -362,7 +362,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
ctl = nla_data(tb[TCA_CHOKE_PARMS]);
- if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log))
return -EINVAL;
if (ctl->limit > CHOKE_MAX_QUEUE)
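
Note: the extra Scell_log argument to red_check_params() (the same change is repeated for gred, red and sfq below) exists to reject log values that would later be used as 32-bit shift counts. The exact bound lives inside red_check_params(); the sketch below only illustrates the general rule that a shift count must stay below the word width (hypothetical helper name):

    #include <stdio.h>

    /* Shifting a 32-bit value by 32 or more is undefined behaviour, so any
     * user-supplied *_log parameter has to be range-checked first.
     */
    static int shift_param_ok(unsigned char log)
    {
        return log < 32;        /* only then is "1u << log" well defined */
    }

    int main(void)
    {
        printf("%d %d\n", shift_param_ok(10), shift_param_ok(32));  /* 1 0 */
        return 0;
    }
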
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index 4dda15588cf4..949163fe68af 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -401,6 +401,7 @@ static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
INIT_LIST_HEAD(&q->new_flows);
INIT_LIST_HEAD(&q->old_flows);
+ timer_setup(&q->adapt_timer, fq_pie_timer, 0);
if (opt) {
err = fq_pie_change(sch, opt, extack);
@@ -426,7 +427,6 @@ static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
pie_vars_init(&flow->vars);
}
- timer_setup(&q->adapt_timer, fq_pie_timer, 0);
mod_timer(&q->adapt_timer, jiffies + HZ / 2);
return 0;
diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
new file mode 100644
index 000000000000..e1e77d3fb6c0
--- /dev/null
+++ b/net/sched/sch_frag.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/ip6_fib.h>
+
+struct sch_frag_data {
+ unsigned long dst;
+ struct qdisc_skb_cb cb;
+ __be16 inner_protocol;
+ u16 vlan_tci;
+ __be16 vlan_proto;
+ unsigned int l2_len;
+ u8 l2_data[VLAN_ETH_HLEN];
+ int (*xmit)(struct sk_buff *skb);
+};
+
+static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage);
+
+static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage);
+
+ if (skb_cow_head(skb, data->l2_len) < 0) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ __skb_dst_copy(skb, data->dst);
+ *qdisc_skb_cb(skb) = data->cb;
+ skb->inner_protocol = data->inner_protocol;
+ if (data->vlan_tci & VLAN_CFI_MASK)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto,
+ data->vlan_tci & ~VLAN_CFI_MASK);
+ else
+ __vlan_hwaccel_clear_tag(skb);
+
+ /* Reconstruct the MAC header. */
+ skb_push(skb, data->l2_len);
+ memcpy(skb->data, &data->l2_data, data->l2_len);
+ skb_postpush_rcsum(skb, skb->data, data->l2_len);
+ skb_reset_mac_header(skb);
+
+ return data->xmit(skb);
+}
+
+static void sch_frag_prepare_frag(struct sk_buff *skb,
+ int (*xmit)(struct sk_buff *skb))
+{
+ unsigned int hlen = skb_network_offset(skb);
+ struct sch_frag_data *data;
+
+ data = this_cpu_ptr(&sch_frag_data_storage);
+ data->dst = skb->_skb_refdst;
+ data->cb = *qdisc_skb_cb(skb);
+ data->xmit = xmit;
+ data->inner_protocol = skb->inner_protocol;
+ if (skb_vlan_tag_present(skb))
+ data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK;
+ else
+ data->vlan_tci = 0;
+ data->vlan_proto = skb->vlan_proto;
+ data->l2_len = hlen;
+ memcpy(&data->l2_data, skb->data, hlen);
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ skb_pull(skb, hlen);
+}
+
+static unsigned int
+sch_frag_dst_get_mtu(const struct dst_entry *dst)
+{
+ return dst->dev->mtu;
+}
+
+static struct dst_ops sch_frag_dst_ops = {
+ .family = AF_UNSPEC,
+ .mtu = sch_frag_dst_get_mtu,
+};
+
+static int sch_fragment(struct net *net, struct sk_buff *skb,
+ u16 mru, int (*xmit)(struct sk_buff *skb))
+{
+ int ret = -1;
+
+ if (skb_network_offset(skb) > VLAN_ETH_HLEN) {
+ net_warn_ratelimited("L2 header too long to fragment\n");
+ goto err;
+ }
+
+ if (skb_protocol(skb, true) == htons(ETH_P_IP)) {
+ struct dst_entry sch_frag_dst;
+ unsigned long orig_dst;
+
+ sch_frag_prepare_frag(skb, xmit);
+ dst_init(&sch_frag_dst, &sch_frag_dst_ops, NULL, 1,
+ DST_OBSOLETE_NONE, DST_NOCOUNT);
+ sch_frag_dst.dev = skb->dev;
+
+ orig_dst = skb->_skb_refdst;
+ skb_dst_set_noref(skb, &sch_frag_dst);
+ IPCB(skb)->frag_max_size = mru;
+
+ ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
+ refdst_drop(orig_dst);
+ } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) {
+ unsigned long orig_dst;
+ struct rt6_info sch_frag_rt;
+
+ sch_frag_prepare_frag(skb, xmit);
+ memset(&sch_frag_rt, 0, sizeof(sch_frag_rt));
+ dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1,
+ DST_OBSOLETE_NONE, DST_NOCOUNT);
+ sch_frag_rt.dst.dev = skb->dev;
+
+ orig_dst = skb->_skb_refdst;
+ skb_dst_set_noref(skb, &sch_frag_rt.dst);
+ IP6CB(skb)->frag_max_size = mru;
+
+ ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb,
+ sch_frag_xmit);
+ refdst_drop(orig_dst);
+ } else {
+ net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n",
+ netdev_name(skb->dev),
+ ntohs(skb_protocol(skb, true)), mru,
+ skb->dev->mtu);
+ goto err;
+ }
+
+ return ret;
+err:
+ kfree_skb(skb);
+ return ret;
+}
+
+int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
+{
+ u16 mru = qdisc_skb_cb(skb)->mru;
+ int err;
+
+ if (mru && skb->len > mru + skb->dev->hard_header_len)
+ err = sch_fragment(dev_net(skb->dev), skb, mru, xmit);
+ else
+ err = xmit(skb);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sch_frag_xmit_hook);
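
Note: sch_frag_xmit_hook() only fragments when a non-zero MRU has been recorded in the qdisc cb and the frame exceeds it plus the device's hard header length; everything else is handed straight to the supplied xmit callback, and the per-CPU sch_frag_data keeps the saved L2 header and dst so each fragment can be rebuilt. A trivial restatement of that size decision with made-up numbers:

    #include <stdio.h>

    /* Mirror of the check in sch_frag_xmit_hook(). */
    static int needs_frag(unsigned int pkt_len, unsigned int mru,
                          unsigned int hard_header_len)
    {
        return mru && pkt_len > mru + hard_header_len;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               needs_frag(1400, 0, 14),         /* no MRU set -> 0 */
               needs_frag(1400, 1500, 14),      /* fits -> 0 */
               needs_frag(1600, 1500, 14));     /* too big -> 1 */
        return 0;
    }
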
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 54c417244642..49eae93d1489 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -802,9 +802,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
const struct Qdisc_ops *ops,
struct netlink_ext_ack *extack)
{
- void *p;
struct Qdisc *sch;
- unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
+ unsigned int size = sizeof(*sch) + ops->priv_size;
int err = -ENOBUFS;
struct net_device *dev;
@@ -815,22 +814,10 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
}
dev = dev_queue->dev;
- p = kzalloc_node(size, GFP_KERNEL,
- netdev_queue_numa_node_read(dev_queue));
+ sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));
- if (!p)
+ if (!sch)
goto errout;
- sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
- /* if we got non aligned memory, ask more and do alignment ourself */
- if (sch != p) {
- kfree(p);
- p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
- netdev_queue_numa_node_read(dev_queue));
- if (!p)
- goto errout;
- sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
- sch->padded = (char *) sch - (char *) p;
- }
__skb_queue_head_init(&sch->gso_skb);
__skb_queue_head_init(&sch->skb_bad_txq);
qdisc_skb_head_init(&sch->q);
@@ -873,7 +860,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
return sch;
errout1:
- kfree(p);
+ kfree(sch);
errout:
return ERR_PTR(err);
}
@@ -941,7 +928,7 @@ void qdisc_free(struct Qdisc *qdisc)
free_percpu(qdisc->cpu_qstats);
}
- kfree((char *) qdisc - qdisc->padded);
+ kfree(qdisc);
}
static void qdisc_free_cb(struct rcu_head *head)
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 8599c6f31b05..e0bc77533acc 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -480,7 +480,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
struct gred_sched *table = qdisc_priv(sch);
struct gred_sched_data *q = table->tab[dp];
- if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) {
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) {
NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters");
return -EINVAL;
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 84f82771cdf5..0c345e43a09a 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -330,7 +330,7 @@ static s64 tabledist(s64 mu, s32 sigma,
/* default uniform distribution */
if (dist == NULL)
- return ((rnd % (2 * sigma)) + mu) - sigma;
+ return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
t = dist->table[rnd % dist->size];
x = (sigma % NETEM_DIST_SCALE) * t;
@@ -812,6 +812,10 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
q->slot_config.max_packets = INT_MAX;
if (q->slot_config.max_bytes == 0)
q->slot_config.max_bytes = INT_MAX;
+
+ /* capping dist_jitter to the range acceptable by tabledist() */
+ q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
+
q->slot.packets_left = q->slot_config.max_packets;
q->slot.bytes_left = q->slot_config.max_bytes;
if (q->slot_config.min_delay | q->slot_config.max_delay |
@@ -1037,6 +1041,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_NETEM_SLOT])
get_slot(q, tb[TCA_NETEM_SLOT]);
+ /* capping jitter to the range acceptable by tabledist() */
+ q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
+
return ret;
get_table_failure:
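
Note: the tabledist() cast matters because sigma is an s32: once the configured jitter can approach INT_MAX (the companion hunks clamp it there), 2 * sigma would overflow signed 32-bit arithmetic, while 2 * (u32)sigma stays well defined. A standalone illustration, using fixed-width userspace types in place of s32/u32:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int32_t sigma = 1610612736;     /* a jitter above INT32_MAX / 2 */
        uint64_t rnd = 0x123456789abcULL;

        /* "2 * sigma" in 32-bit signed arithmetic cannot represent 3221225472
         * and would be undefined behaviour; promoting to an unsigned 32-bit
         * value first, as the fix does, keeps the span and the modulo defined.
         */
        uint32_t span = 2u * (uint32_t)sigma;

        printf("span=%" PRIu32 ", rnd %% span=%" PRIu64 "\n", span, rnd % span);
        return 0;
    }
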
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index c65077f0c0f3..5a457ff61acd 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -405,7 +405,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
/* We restart the measurement cycle if the following conditions are met
* 1. If the delay has been low for 2 consecutive Tupdate periods
* 2. Calculated drop probability is zero
- * 3. If average dq_rate_estimator is enabled, we have atleast one
+ * 3. If average dq_rate_estimator is enabled, we have at least one
* estimate for the avg_dq_rate ie., is a non-zero value
*/
if ((vars->qdelay < params->target / 2) &&
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index e89fab6ccb34..b4ae34d7aa96 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -250,7 +250,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb,
max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
ctl = nla_data(tb[TCA_RED_PARMS]);
- if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log))
return -EINVAL;
err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS,
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index bca2be57d9fc..b25e51440623 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -647,7 +647,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
}
if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
- ctl_v1->Wlog))
+ ctl_v1->Wlog, ctl_v1->Scell_log))
return -EINVAL;
if (ctl_v1 && ctl_v1->qth_min) {
p = kmalloc(sizeof(*p), GFP_KERNEL);
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index b0ad7687ee2c..6f775275826a 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -6,6 +6,7 @@
*
*/
+#include <linux/ethtool.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -1596,6 +1597,22 @@ free_sched:
return err;
}
+static void taprio_reset(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int i;
+
+ hrtimer_cancel(&q->advance_timer);
+ if (q->qdiscs) {
+ for (i = 0; i < dev->num_tx_queues; i++)
+ if (q->qdiscs[i])
+ qdisc_reset(q->qdiscs[i]);
+ }
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+}
+
static void taprio_destroy(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
@@ -1606,12 +1623,11 @@ static void taprio_destroy(struct Qdisc *sch)
list_del(&q->taprio_list);
spin_unlock(&taprio_list_lock);
- hrtimer_cancel(&q->advance_timer);
taprio_disable_offload(dev, q, NULL);
if (q->qdiscs) {
- for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
+ for (i = 0; i < dev->num_tx_queues; i++)
qdisc_put(q->qdiscs[i]);
kfree(q->qdiscs);
@@ -1953,6 +1969,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
.init = taprio_init,
.change = taprio_change,
.destroy = taprio_destroy,
+ .reset = taprio_reset,
.peek = taprio_peek,
.dequeue = taprio_dequeue,
.enqueue = taprio_enqueue,
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index 39d7fa9569f8..5da599ff84a9 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -11,6 +11,7 @@ menuconfig IP_SCTP
select CRYPTO_HMAC
select CRYPTO_SHA1
select LIBCRC32C
+ select NET_UDP_TUNNEL
help
Stream Control Transmission Protocol
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 8d735461fa19..336df4b36655 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -99,6 +99,8 @@ static struct sctp_association *sctp_association_init(
*/
asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
+ asoc->encap_port = sp->encap_port;
+
/* Initialize path max retrans value. */
asoc->pathmaxrxt = sp->pathmaxrxt;
@@ -624,6 +626,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
*/
peer->hbinterval = asoc->hbinterval;
+ peer->encap_port = asoc->encap_port;
+
/* Set the path max_retrans. */
peer->pathmaxrxt = asoc->pathmaxrxt;
@@ -1351,7 +1355,7 @@ static void sctp_select_active_and_retran_path(struct sctp_association *asoc)
}
/* We did not find anything useful for a possible retransmission
- * path; either primary path that we found is the the same as
+ * path; either primary path that we found is the same as
* the current one, or we didn't generally find an active one.
*/
if (trans_sec == NULL)
@@ -1537,7 +1541,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
/* If we've reached or overflowed our receive buffer, announce
* a 0 rwnd if rwnd would still be positive. Store the
- * the potential pressure overflow so that the window can be restored
+ * potential pressure overflow so that the window can be restored
* back to original value.
*/
if (rx_count >= asoc->base.sk->sk_rcvbuf)
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 7e59d8a18f3e..6f8319b828b0 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -445,7 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey(
}
/*
- * Initialize all the possible digest transforms that we can use. Right now
+ * Initialize all the possible digest transforms that we can use. Right
* now, the supported digests are SHA1 and SHA256. We do this here once
* because of the restrictiong that transforms may only be allocated in
* user context. This forces us to pre-allocated all possible transforms
@@ -811,7 +811,7 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
}
/* Set a new shared key on either endpoint or association. If the
- * the key with a same ID already exists, replace the key (remove the
+ * key with a same ID already exists, replace the key (remove the
* old key and add a new one).
*/
int sctp_auth_set_key(struct sctp_endpoint *ep,
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 701c5a4e441d..53e5ed79f63f 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -505,7 +505,7 @@ int sctp_in_scope(struct net *net, const union sctp_addr *addr,
return 0;
/*
* For INIT and INIT-ACK address list, let L be the level of
- * of requested destination address, sender and receiver
+ * requested destination address, sender and receiver
* SHOULD include all of its addresses with level greater
* than or equal to L.
*
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index ab6a997e222f..fd4f8243cc35 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -179,7 +179,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
__func__, asoc, max_data);
}
- /* If the the peer requested that we authenticate DATA chunks
+ /* If the peer requested that we authenticate DATA chunks
* we need to account for bundling of the AUTH chunks along with
* DATA.
*/
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 55d4fc6f371d..d508f6f3dd08 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -449,7 +449,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
else {
if (!mod_timer(&t->proto_unreach_timer,
jiffies + (HZ/20)))
- sctp_association_hold(asoc);
+ sctp_transport_hold(t);
}
} else {
struct net *net = sock_net(sk);
@@ -458,7 +458,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
"encountered!\n", __func__);
if (del_timer(&t->proto_unreach_timer))
- sctp_association_put(asoc);
+ sctp_transport_put(t);
sctp_do_sm(net, SCTP_EVENT_T_OTHER,
SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 8a58f42d6d19..c3e89c776e66 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -55,6 +55,7 @@
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/sctp/sctp.h>
+#include <net/udp_tunnel.h>
#include <linux/uaccess.h>
@@ -191,33 +192,53 @@ out:
return ret;
}
-static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
+static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
{
+ struct dst_entry *dst = dst_clone(t->dst);
+ struct flowi6 *fl6 = &t->fl.u.ip6;
struct sock *sk = skb->sk;
struct ipv6_pinfo *np = inet6_sk(sk);
- struct flowi6 *fl6 = &transport->fl.u.ip6;
__u8 tclass = np->tclass;
- int res;
+ __be32 label;
pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
skb->len, &fl6->saddr, &fl6->daddr);
- if (transport->dscp & SCTP_DSCP_SET_MASK)
- tclass = transport->dscp & SCTP_DSCP_VAL_MASK;
+ if (t->dscp & SCTP_DSCP_SET_MASK)
+ tclass = t->dscp & SCTP_DSCP_VAL_MASK;
if (INET_ECN_is_capable(tclass))
IP6_ECN_flow_xmit(sk, fl6->flowlabel);
- if (!(transport->param_flags & SPP_PMTUD_ENABLE))
+ if (!(t->param_flags & SPP_PMTUD_ENABLE))
skb->ignore_df = 1;
SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
- rcu_read_lock();
- res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
- tclass, sk->sk_priority);
- rcu_read_unlock();
- return res;
+ if (!t->encap_port || !sctp_sk(sk)->udp_port) {
+ int res;
+
+ skb_dst_set(skb, dst);
+ rcu_read_lock();
+ res = ip6_xmit(sk, skb, fl6, sk->sk_mark,
+ rcu_dereference(np->opt),
+ tclass, sk->sk_priority);
+ rcu_read_unlock();
+ return res;
+ }
+
+ if (skb_is_gso(skb))
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+
+ skb->encapsulation = 1;
+ skb_reset_inner_mac_header(skb);
+ skb_reset_inner_transport_header(skb);
+ skb_set_inner_ipproto(skb, IPPROTO_SCTP);
+ label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6);
+
+ return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr,
+ &fl6->daddr, tclass, ip6_dst_hoplimit(dst),
+ label, sctp_sk(sk)->udp_port, t->encap_port, false);
}
/* Returns the dst cache entry for the given source and destination ip
@@ -1053,6 +1074,7 @@ static struct inet_protosw sctpv6_stream_protosw = {
static int sctp6_rcv(struct sk_buff *skb)
{
+ SCTP_INPUT_CB(skb)->encap_port = 0;
return sctp_rcv(skb) ? -1 : 0;
}
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 74847d613835..ce281a9a2875 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -27,7 +27,11 @@ static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
{
skb->ip_summed = CHECKSUM_NONE;
skb->csum_not_inet = 0;
- gso_reset_checksum(skb, ~0);
+ /* csum and csum_start in GSO CB may be needed to do the UDP
+ * checksum when it's a UDP tunneling packet.
+ */
+ SKB_GSO_CB(skb)->csum = (__force __wsum)~0;
+ SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len;
return sctp_compute_cksum(skb, skb_transport_offset(skb));
}
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 1441eaf460bb..6614c9fdc51e 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -508,20 +508,14 @@ merge:
sizeof(struct inet6_skb_parm)));
skb_shinfo(head)->gso_segs = pkt_count;
skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
- rcu_read_lock();
- if (skb_dst(head) != tp->dst) {
- dst_hold(tp->dst);
- sk_setup_caps(sk, tp->dst);
- }
- rcu_read_unlock();
goto chksum;
}
if (sctp_checksum_disable)
return 1;
- if (!(skb_dst(head)->dev->features & NETIF_F_SCTP_CRC) ||
- dst_xfrm(skb_dst(head)) || packet->ipfragok) {
+ if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) ||
+ dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) {
struct sctphdr *sh =
(struct sctphdr *)skb_transport_header(head);
@@ -548,7 +542,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
struct sctp_association *asoc = tp->asoc;
struct sctp_chunk *chunk, *tmp;
int pkt_count, gso = 0;
- struct dst_entry *dst;
struct sk_buff *head;
struct sctphdr *sh;
struct sock *sk;
@@ -585,13 +578,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
sh->checksum = 0;
/* drop packet if no dst */
- dst = dst_clone(tp->dst);
- if (!dst) {
+ if (!tp->dst) {
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
kfree_skb(head);
goto out;
}
- skb_dst_set(head, dst);
+
+ rcu_read_lock();
+ if (__sk_dst_get(sk) != tp->dst) {
+ dst_hold(tp->dst);
+ sk_setup_caps(sk, tp->dst);
+ }
+ rcu_read_unlock();
/* pack up chunks */
pkt_count = sctp_packet_pack(packet, head, gso, gfp);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index f7da88ae20a5..982a87b3e11f 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -215,6 +215,12 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v)
{
struct sctp_ht_iter *iter = seq->private;
+ if (v && v != SEQ_START_TOKEN) {
+ struct sctp_transport *transport = v;
+
+ sctp_transport_put(transport);
+ }
+
sctp_transport_walk_stop(&iter->hti);
}
@@ -222,6 +228,12 @@ static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct sctp_ht_iter *iter = seq->private;
+ if (v && v != SEQ_START_TOKEN) {
+ struct sctp_transport *transport = v;
+
+ sctp_transport_put(transport);
+ }
+
++*pos;
return sctp_transport_get_next(seq_file_net(seq), &iter->hti);
@@ -277,8 +289,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
sk->sk_rcvbuf);
seq_printf(seq, "\n");
- sctp_transport_put(transport);
-
return 0;
}
@@ -354,8 +364,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
- sctp_transport_put(transport);
-
return 0;
}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d19db22262fd..6f2bbfeec3a4 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -44,6 +44,7 @@
#include <net/addrconf.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
+#include <net/udp_tunnel.h>
#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024)
@@ -372,7 +373,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
* Level 3 - private addresses.
* Level 4 - global addresses
* For INIT and INIT-ACK address list, let L be the level of
- * of requested destination address, sender and receiver
+ * requested destination address, sender and receiver
* SHOULD include all of its addresses with level greater
* than or equal to L.
*
@@ -840,6 +841,92 @@ static int sctp_ctl_sock_init(struct net *net)
return 0;
}
+static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ SCTP_INPUT_CB(skb)->encap_port = udp_hdr(skb)->source;
+
+ skb_set_transport_header(skb, sizeof(struct udphdr));
+ sctp_rcv(skb);
+ return 0;
+}
+
+static int sctp_udp_err_lookup(struct sock *sk, struct sk_buff *skb)
+{
+ struct sctp_association *asoc;
+ struct sctp_transport *t;
+ int family;
+
+ skb->transport_header += sizeof(struct udphdr);
+ family = (ip_hdr(skb)->version == 4) ? AF_INET : AF_INET6;
+ sk = sctp_err_lookup(dev_net(skb->dev), family, skb, sctp_hdr(skb),
+ &asoc, &t);
+ if (!sk)
+ return -ENOENT;
+
+ sctp_err_finish(sk, t);
+ return 0;
+}
+
+int sctp_udp_sock_start(struct net *net)
+{
+ struct udp_tunnel_sock_cfg tuncfg = {NULL};
+ struct udp_port_cfg udp_conf = {0};
+ struct socket *sock;
+ int err;
+
+ udp_conf.family = AF_INET;
+ udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+ udp_conf.local_udp_port = htons(net->sctp.udp_port);
+ err = udp_sock_create(net, &udp_conf, &sock);
+ if (err) {
+ pr_err("Failed to create the SCTP UDP tunneling v4 sock\n");
+ return err;
+ }
+
+ tuncfg.encap_type = 1;
+ tuncfg.encap_rcv = sctp_udp_rcv;
+ tuncfg.encap_err_lookup = sctp_udp_err_lookup;
+ setup_udp_tunnel_sock(net, sock, &tuncfg);
+ net->sctp.udp4_sock = sock->sk;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ memset(&udp_conf, 0, sizeof(udp_conf));
+
+ udp_conf.family = AF_INET6;
+ udp_conf.local_ip6 = in6addr_any;
+ udp_conf.local_udp_port = htons(net->sctp.udp_port);
+ udp_conf.use_udp6_rx_checksums = true;
+ udp_conf.ipv6_v6only = true;
+ err = udp_sock_create(net, &udp_conf, &sock);
+ if (err) {
+ pr_err("Failed to create the SCTP UDP tunneling v6 sock\n");
+ udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket);
+ net->sctp.udp4_sock = NULL;
+ return err;
+ }
+
+ tuncfg.encap_type = 1;
+ tuncfg.encap_rcv = sctp_udp_rcv;
+ tuncfg.encap_err_lookup = sctp_udp_err_lookup;
+ setup_udp_tunnel_sock(net, sock, &tuncfg);
+ net->sctp.udp6_sock = sock->sk;
+#endif
+
+ return 0;
+}
+
+void sctp_udp_sock_stop(struct net *net)
+{
+ if (net->sctp.udp4_sock) {
+ udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket);
+ net->sctp.udp4_sock = NULL;
+ }
+ if (net->sctp.udp6_sock) {
+ udp_tunnel_sock_release(net->sctp.udp6_sock->sk_socket);
+ net->sctp.udp6_sock = NULL;
+ }
+}
+
/* Register address family specific functions. */
int sctp_register_af(struct sctp_af *af)
{
@@ -971,25 +1058,44 @@ static int sctp_inet_supported_addrs(const struct sctp_sock *opt,
}
/* Wrapper routine that calls the ip transmit routine. */
-static inline int sctp_v4_xmit(struct sk_buff *skb,
- struct sctp_transport *transport)
+static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
{
- struct inet_sock *inet = inet_sk(skb->sk);
+ struct dst_entry *dst = dst_clone(t->dst);
+ struct flowi4 *fl4 = &t->fl.u.ip4;
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(sk);
__u8 dscp = inet->tos;
+ __be16 df = 0;
pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
- skb->len, &transport->fl.u.ip4.saddr,
- &transport->fl.u.ip4.daddr);
+ skb->len, &fl4->saddr, &fl4->daddr);
- if (transport->dscp & SCTP_DSCP_SET_MASK)
- dscp = transport->dscp & SCTP_DSCP_VAL_MASK;
+ if (t->dscp & SCTP_DSCP_SET_MASK)
+ dscp = t->dscp & SCTP_DSCP_VAL_MASK;
+
+ inet->pmtudisc = t->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO
+ : IP_PMTUDISC_DONT;
+ SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
- inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ?
- IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
+ if (!t->encap_port || !sctp_sk(sk)->udp_port) {
+ skb_dst_set(skb, dst);
+ return __ip_queue_xmit(sk, skb, &t->fl, dscp);
+ }
- SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS);
+ if (skb_is_gso(skb))
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
- return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp);
+ if (ip_dont_fragment(sk, dst) && !skb->ignore_df)
+ df = htons(IP_DF);
+
+ skb->encapsulation = 1;
+ skb_reset_inner_mac_header(skb);
+ skb_reset_inner_transport_header(skb);
+ skb_set_inner_ipproto(skb, IPPROTO_SCTP);
+ udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr,
+ fl4->daddr, dscp, ip4_dst_hoplimit(dst), df,
+ sctp_sk(sk)->udp_port, t->encap_port, false, false);
+ return 0;
}
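Distilled from the branch above: the packet is UDP-encapsulated only when both sides have a port configured, otherwise it leaves through the plain IP path. A minimal, self-contained restatement of that condition (sketch only; the real path also handles GSO, the DF bit and dst references):

#include <stdbool.h>
#include <stdint.h>

static bool sctp_should_udp_encap(uint16_t local_udp_port,
				  uint16_t peer_encap_port)
{
	/* mirrors: !t->encap_port || !sctp_sk(sk)->udp_port  =>  plain IP */
	return local_udp_port != 0 && peer_encap_port != 0;
}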
static struct sctp_af sctp_af_inet;
@@ -1054,9 +1160,15 @@ static struct inet_protosw sctp_stream_protosw = {
.flags = SCTP_PROTOSW_FLAG
};
+static int sctp4_rcv(struct sk_buff *skb)
+{
+ SCTP_INPUT_CB(skb)->encap_port = 0;
+ return sctp_rcv(skb);
+}
+
/* Register with IP layer. */
static const struct net_protocol sctp_protocol = {
- .handler = sctp_rcv,
+ .handler = sctp4_rcv,
.err_handler = sctp_v4_err,
.no_policy = 1,
.netns_ok = 1,
@@ -1271,6 +1383,12 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Enable ECN by default. */
net->sctp.ecn_enable = 1;
+ /* Set UDP tunneling listening port to 0 by default */
+ net->sctp.udp_port = 0;
+
+ /* Set remote encap port to 0 by default */
+ net->sctp.encap_port = 0;
+
/* Set SCOPE policy to enabled */
net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE;
@@ -1483,10 +1601,10 @@ static __init int sctp_init(void)
num_entries = (1UL << order) * PAGE_SIZE /
sizeof(struct sctp_bind_hashbucket);
- /* And finish by rounding it down to the nearest power of two
- * this wastes some memory of course, but its needed because
+ /* And finish by rounding it down to the nearest power of two.
+ * This wastes some memory of course, but it's needed because
* the hash function operates based on the assumption that
- * that the number of entries is a power of two
+ * the number of entries is a power of two.
*/
sctp_port_hashsize = rounddown_pow_of_two(num_entries);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index c11c24524652..f77484df097b 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1142,6 +1142,26 @@ nodata:
return retval;
}
+struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk)
+{
+ struct sctp_new_encap_port_hdr nep;
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_abort(asoc, chunk,
+ sizeof(struct sctp_errhdr) + sizeof(nep));
+ if (!retval)
+ goto nodata;
+
+ sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep));
+ nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port;
+ nep.new_port = chunk->transport->encap_port;
+ sctp_addto_chunk(retval, sizeof(nep), &nep);
+
+nodata:
+ return retval;
+}
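The error cause built above carries only the two ports involved: the UDP port the chunk actually arrived on and the one this side expects for the transport. The body layout it relies on is presumably declared along these lines in the companion header update; shown here as a sketch:

#include <linux/types.h>

struct sctp_new_encap_port_hdr {
	__be16 cur_port;	/* encap port the packet was received with */
	__be16 new_port;	/* encap port expected for this transport */
};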
+
/* Make a HEARTBEAT chunk. */
struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
const struct sctp_transport *transport)
@@ -1235,7 +1255,7 @@ nodata:
/* Create an Operation Error chunk of a fixed size, specifically,
* min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads.
- * This is a helper function to allocate an error chunk for for those
+ * This is a helper function to allocate an error chunk for those
* invalid parameter codes in which we may not want to report all the
* errors, if the incoming chunk is large. If it can't fit in a single
* packet, we ignore it.
@@ -1780,7 +1800,7 @@ no_hmac:
* for init collision case of lost COOKIE ACK.
* If skb has been timestamped, then use the stamp, otherwise
* use current time. This introduces a small possibility that
- * that a cookie may be considered expired, but his would only slow
+ * a cookie may be considered expired, but this would only slow
* down the new association establishment instead of every packet.
*/
if (sock_flag(ep->base.sk, SOCK_TIMESTAMP))
@@ -2319,8 +2339,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
/* This implementation defaults to making the first transport
* added as the primary transport. The source address seems to
- * be a a better choice than any of the embedded addresses.
+ * be a better choice than any of the embedded addresses.
*/
+ asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port;
if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
goto nomem;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index aa821e71f05e..0948f14ce221 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -419,7 +419,7 @@ void sctp_generate_proto_unreach_event(struct timer_list *t)
/* Try again later. */
if (!mod_timer(&transport->proto_unreach_timer,
jiffies + (HZ/20)))
- sctp_association_hold(asoc);
+ sctp_transport_hold(transport);
goto out_unlock;
}
@@ -435,7 +435,7 @@ void sctp_generate_proto_unreach_event(struct timer_list *t)
out_unlock:
bh_unlock_sock(sk);
- sctp_association_put(asoc);
+ sctp_transport_put(transport);
}
/* Handle the timeout of the RE-CONFIG timer. */
@@ -1601,12 +1601,12 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type,
break;
case SCTP_CMD_INIT_FAILED:
- sctp_cmd_init_failed(commands, asoc, cmd->obj.u32);
+ sctp_cmd_init_failed(commands, asoc, cmd->obj.u16);
break;
case SCTP_CMD_ASSOC_FAILED:
sctp_cmd_assoc_failed(commands, asoc, event_type,
- subtype, chunk, cmd->obj.u32);
+ subtype, chunk, cmd->obj.u16);
break;
case SCTP_CMD_INIT_COUNTER_INC:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index c669f8bd1eab..af2b7041fa4e 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -87,6 +87,13 @@ static enum sctp_disposition sctp_sf_tabort_8_4_8(
const union sctp_subtype type,
void *arg,
struct sctp_cmd_seq *commands);
+static enum sctp_disposition sctp_sf_new_encap_port(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const union sctp_subtype type,
+ void *arg,
+ struct sctp_cmd_seq *commands);
static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
static enum sctp_disposition sctp_stop_t1_and_abort(
@@ -1493,6 +1500,10 @@ static enum sctp_disposition sctp_sf_do_unexpected_init(
if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
commands);
+
+ if (SCTP_INPUT_CB(chunk->skb)->encap_port != chunk->transport->encap_port)
+ return sctp_sf_new_encap_port(net, ep, asoc, type, arg, commands);
+
/* Grab the INIT header. */
chunk->subh.init_hdr = (struct sctp_inithdr *)chunk->skb->data;
@@ -3392,6 +3403,45 @@ static enum sctp_disposition sctp_sf_tabort_8_4_8(
sctp_packet_append_chunk(packet, abort);
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(packet));
+
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+
+ sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/* Handling of SCTP Packets Containing an INIT Chunk Matching an
+ * Existing Association when the UDP encap port is incorrect.
+ *
+ * From Section 4 of draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.
+ */
+static enum sctp_disposition sctp_sf_new_encap_port(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const union sctp_subtype type,
+ void *arg,
+ struct sctp_cmd_seq *commands)
+{
+ struct sctp_packet *packet = NULL;
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *abort;
+
+ packet = sctp_ootb_pkt_new(net, asoc, chunk);
+ if (!packet)
+ return SCTP_DISPOSITION_NOMEM;
+
+ abort = sctp_make_new_encap_port(asoc, chunk);
+ if (!abort) {
+ sctp_ootb_pkt_free(packet);
+ return SCTP_DISPOSITION_NOMEM;
+ }
+
+ abort->skb->sk = ep->base.sk;
+
+ sctp_packet_append_chunk(packet, abort);
+
sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
SCTP_PACKET(packet));
@@ -6268,6 +6318,8 @@ static struct sctp_packet *sctp_ootb_pkt_new(
if (!transport)
goto nomem;
+ transport->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port;
+
/* Cache a route for the transport with the chunk's destination as
* the source address.
*/
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 53d0a4161df3..a710917c5ac7 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4417,6 +4417,55 @@ out:
return retval;
}
+static int sctp_setsockopt_encap_port(struct sock *sk,
+ struct sctp_udpencaps *encap,
+ unsigned int optlen)
+{
+ struct sctp_association *asoc;
+ struct sctp_transport *t;
+ __be16 encap_port;
+
+ if (optlen != sizeof(*encap))
+ return -EINVAL;
+
+ /* If an address other than INADDR_ANY is specified, and
+ * no transport is found, then the request is invalid.
+ */
+ encap_port = (__force __be16)encap->sue_port;
+ if (!sctp_is_any(sk, (union sctp_addr *)&encap->sue_address)) {
+ t = sctp_addr_id2transport(sk, &encap->sue_address,
+ encap->sue_assoc_id);
+ if (!t)
+ return -EINVAL;
+
+ t->encap_port = encap_port;
+ return 0;
+ }
+
+ /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the
+ * socket is a one to many style socket, and an association
+ * was not found, then the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, encap->sue_assoc_id);
+ if (!asoc && encap->sue_assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP))
+ return -EINVAL;
+
+ /* If changes are for association, also apply encap_port to
+ * each transport.
+ */
+ if (asoc) {
+ list_for_each_entry(t, &asoc->peer.transport_addr_list,
+ transports)
+ t->encap_port = encap_port;
+
+ return 0;
+ }
+
+ sctp_sk(sk)->encap_port = encap_port;
+ return 0;
+}
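From the application side, the new option takes a struct sctp_udpencaps with the usual address/assoc-id scoping and the port in network byte order, matching the __be16 handling above. A minimal usage sketch, assuming a libc/uapi sctp.h recent enough to provide the structure, SCTP_REMOTE_UDP_ENCAPS_PORT and SCTP_FUTURE_ASSOC:

#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

/* Set the remote encapsulation port as the socket default (no address,
 * SCTP_FUTURE_ASSOC) and read it back; 9899 is the IANA SCTP-over-UDP port.
 */
static int set_and_check_encap_port(int fd)
{
	struct sctp_udpencaps enc;
	socklen_t len = sizeof(enc);

	memset(&enc, 0, sizeof(enc));
	enc.sue_assoc_id = SCTP_FUTURE_ASSOC;
	enc.sue_port = htons(9899);		/* 0 turns encapsulation off */
	if (setsockopt(fd, IPPROTO_SCTP, SCTP_REMOTE_UDP_ENCAPS_PORT,
		       &enc, sizeof(enc)) < 0)
		return -1;

	memset(&enc, 0, sizeof(enc));
	enc.sue_assoc_id = SCTP_FUTURE_ASSOC;
	if (getsockopt(fd, IPPROTO_SCTP, SCTP_REMOTE_UDP_ENCAPS_PORT,
		       &enc, &len) < 0)
		return -1;
	return ntohs(enc.sue_port) == 9899 ? 0 : -1;
}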
+
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4636,6 +4685,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE:
retval = sctp_setsockopt_pf_expose(sk, kopt, optlen);
break;
+ case SCTP_REMOTE_UDP_ENCAPS_PORT:
+ retval = sctp_setsockopt_encap_port(sk, kopt, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -4876,6 +4928,8 @@ static int sctp_init_sock(struct sock *sk)
* be modified via SCTP_PEER_ADDR_PARAMS
*/
sp->hbinterval = net->sctp.hb_interval;
+ sp->udp_port = htons(net->sctp.udp_port);
+ sp->encap_port = htons(net->sctp.encap_port);
sp->pathmaxrxt = net->sctp.max_retrans_path;
sp->pf_retrans = net->sctp.pf_retrans;
sp->ps_retrans = net->sctp.ps_retrans;
@@ -7790,6 +7844,65 @@ out:
return retval;
}
+static int sctp_getsockopt_encap_port(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_association *asoc;
+ struct sctp_udpencaps encap;
+ struct sctp_transport *t;
+ __be16 encap_port;
+
+ if (len < sizeof(encap))
+ return -EINVAL;
+
+ len = sizeof(encap);
+ if (copy_from_user(&encap, optval, len))
+ return -EFAULT;
+
+ /* If an address other than INADDR_ANY is specified, and
+ * no transport is found, then the request is invalid.
+ */
+ if (!sctp_is_any(sk, (union sctp_addr *)&encap.sue_address)) {
+ t = sctp_addr_id2transport(sk, &encap.sue_address,
+ encap.sue_assoc_id);
+ if (!t) {
+ pr_debug("%s: failed no transport\n", __func__);
+ return -EINVAL;
+ }
+
+ encap_port = t->encap_port;
+ goto out;
+ }
+
+ /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the
+ * socket is a one to many style socket, and an association
+ * was not found, then the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, encap.sue_assoc_id);
+ if (!asoc && encap.sue_assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP)) {
+ pr_debug("%s: failed no association\n", __func__);
+ return -EINVAL;
+ }
+
+ if (asoc) {
+ encap_port = asoc->encap_port;
+ goto out;
+ }
+
+ encap_port = sctp_sk(sk)->encap_port;
+
+out:
+ encap.sue_port = (__force uint16_t)encap_port;
+ if (copy_to_user(optval, &encap, len))
+ return -EFAULT;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+}
+
static int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -8010,6 +8123,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE:
retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen);
break;
+ case SCTP_REMOTE_UDP_ENCAPS_PORT:
+ retval = sctp_getsockopt_encap_port(sk, len, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index c16c80963e55..e92df779af73 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -36,6 +36,7 @@ static int rto_alpha_max = 1000;
static int rto_beta_max = 1000;
static int pf_expose_max = SCTP_PF_EXPOSE_MAX;
static int ps_retrans_max = SCTP_PS_RETRANS_MAX;
+static int udp_port_max = 65535;
static unsigned long max_autoclose_min = 0;
static unsigned long max_autoclose_max =
@@ -48,6 +49,8 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos);
static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos);
+static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos);
static int proc_sctp_do_auth(struct ctl_table *ctl, int write,
@@ -291,6 +294,24 @@ static struct ctl_table sctp_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "udp_port",
+ .data = &init_net.sctp.udp_port,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_sctp_do_udp_port,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &udp_port_max,
+ },
+ {
+ .procname = "encap_port",
+ .data = &init_net.sctp.encap_port,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &udp_port_max,
+ },
+ {
.procname = "addr_scope_policy",
.data = &init_net.sctp.scope_policy,
.maxlen = sizeof(int),
@@ -477,6 +498,47 @@ static int proc_sctp_do_auth(struct ctl_table *ctl, int write,
return ret;
}
+static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = current->nsproxy->net_ns;
+ unsigned int min = *(unsigned int *)ctl->extra1;
+ unsigned int max = *(unsigned int *)ctl->extra2;
+ struct ctl_table tbl;
+ int ret, new_value;
+
+ memset(&tbl, 0, sizeof(struct ctl_table));
+ tbl.maxlen = sizeof(unsigned int);
+
+ if (write)
+ tbl.data = &new_value;
+ else
+ tbl.data = &net->sctp.udp_port;
+
+ ret = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0) {
+ struct sock *sk = net->sctp.ctl_sock;
+
+ if (new_value > max || new_value < min)
+ return -EINVAL;
+
+ net->sctp.udp_port = new_value;
+ sctp_udp_sock_stop(net);
+ if (new_value) {
+ ret = sctp_udp_sock_start(net);
+ if (ret)
+ net->sctp.udp_port = 0;
+ }
+
+ /* Update the value in the control socket */
+ lock_sock(sk);
+ sctp_sk(sk)->udp_port = htons(net->sctp.udp_port);
+ release_sock(sk);
+ }
+
+ return ret;
+}
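The handler above is driven from userspace through /proc/sys/net/sctp/udp_port: a non-zero value opens the listening tunneling sockets via sctp_udp_sock_start(), zero closes them again. A minimal sketch, assuming procfs at its usual mount point and that the caller's network namespace is the one to configure (9899 being the IANA-assigned SCTP-over-UDP port from RFC 6951):

#include <stdio.h>

/* Write the port to the new sysctl; 0 closes the tunneling sockets again. */
static int sctp_set_udp_tunneling_port(unsigned int port)
{
	FILE *f = fopen("/proc/sys/net/sctp/udp_port", "w");
	int rc;

	if (!f)
		return -1;
	rc = fprintf(f, "%u\n", port) < 0 ? -1 : 0;
	if (fclose(f) != 0)
		rc = -1;
	return rc;
}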
+
int sctp_sysctl_net_register(struct net *net)
{
struct ctl_table *table;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 806af58f4375..bf0ac467e757 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -8,7 +8,7 @@
*
* This file is part of the SCTP kernel implementation
*
- * This module provides the abstraction for an SCTP tranport representing
+ * This module provides the abstraction for an SCTP transport representing
* a remote transport address. For local transport addresses, we just use
* union sctp_addr.
*
@@ -123,7 +123,7 @@ void sctp_transport_free(struct sctp_transport *transport)
/* Delete the T3_rtx timer if it's active.
* There is no point in not doing this now and letting
* structure hang around in memory since we know
- * the tranport is going away.
+ * the transport is going away.
*/
if (del_timer(&transport->T3_rtx_timer))
sctp_transport_put(transport);
@@ -133,7 +133,7 @@ void sctp_transport_free(struct sctp_transport *transport)
/* Delete the ICMP proto unreachable timer if it's active. */
if (del_timer(&transport->proto_unreach_timer))
- sctp_association_put(transport->asoc);
+ sctp_transport_put(transport);
sctp_transport_put(transport);
}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 1c6c640607c5..407fed46931b 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -740,7 +740,7 @@ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq)
/* Helper function to gather skbs that have possibly become
- * ordered by an an incoming chunk.
+ * ordered by an incoming chunk.
*/
static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
struct sctp_ulpevent *event)
diff --git a/net/smc/Makefile b/net/smc/Makefile
index cb1254541f37..77e54fe42b1c 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -2,4 +2,4 @@
obj-$(CONFIG_SMC) += smc.o
obj-$(CONFIG_SMC_DIAG) += smc_diag.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e7649bbc2b87..47340b3b514f 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -26,6 +26,7 @@
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>
#include <linux/rcupdate_wait.h>
+#include <linux/ctype.h>
#include <net/sock.h>
#include <net/tcp.h>
@@ -44,6 +45,7 @@
#include "smc_ib.h"
#include "smc_ism.h"
#include "smc_pnet.h"
+#include "smc_netlink.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"
@@ -55,6 +57,9 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
* creation on client
*/
+struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+struct workqueue_struct *smc_close_wq; /* wq for close work */
+
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
@@ -436,26 +441,52 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc)
static void smcr_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
- int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
+ int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
- smc->conn.peer_rmbe_idx = clc->rmbe_idx;
- smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+ smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
+ smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
smc->conn.peer_rmbe_size = bufsize;
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}
+static bool smc_isascii(char *hostname)
+{
+ int i;
+
+ for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
+ if (!isascii(hostname[i]))
+ return false;
+ return true;
+}
+
static void smcd_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
- int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
+ int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
- smc->conn.peer_rmbe_idx = clc->dmbe_idx;
- smc->conn.peer_token = clc->token;
+ smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
+ smc->conn.peer_token = clc->d0.token;
/* msg header takes up space in the buffer */
smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
+ if (clc->hdr.version > SMC_V1 &&
+ (clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) {
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)clc;
+ struct smc_clc_first_contact_ext *fce =
+ (struct smc_clc_first_contact_ext *)
+ (((u8 *)clc_v2) + sizeof(*clc_v2));
+
+ memcpy(smc->conn.lgr->negotiated_eid, clc_v2->eid,
+ SMC_MAX_EID_LEN);
+ smc->conn.lgr->peer_os = fce->os_type;
+ smc->conn.lgr->peer_smc_release = fce->release;
+ if (smc_isascii(fce->hostname))
+ memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
+ SMC_MAX_HOSTNAME_LEN);
+ }
}
static void smc_conn_save_peer_info(struct smc_sock *smc,
@@ -470,11 +501,11 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
static void smc_link_save_peer_info(struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc)
{
- link->peer_qpn = ntoh24(clc->qpn);
- memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
- memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
- link->peer_psn = ntoh24(clc->psn);
- link->peer_mtu = clc->qp_mtu;
+ link->peer_qpn = ntoh24(clc->r0.qpn);
+ memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE);
+ memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac));
+ link->peer_psn = ntoh24(clc->r0.psn);
+ link->peer_mtu = clc->r0.qp_mtu;
}
static void smc_switch_to_fallback(struct smc_sock *smc)
@@ -501,7 +532,8 @@ static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
}
/* decline and fall back during connect */
-static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
+static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
+ u8 version)
{
int rc;
@@ -511,7 +543,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
return reason_code;
}
if (reason_code != SMC_CLC_DECL_PEERDECL) {
- rc = smc_clc_send_decline(smc, reason_code);
+ rc = smc_clc_send_decline(smc, reason_code, version);
if (rc < 0) {
if (smc->sk.sk_state == SMC_INIT)
sock_put(&smc->sk); /* passive closing */
@@ -521,24 +553,12 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
return smc_connect_fallback(smc, reason_code);
}
-/* abort connecting */
-static int smc_connect_abort(struct smc_sock *smc, int reason_code,
- int local_contact)
+static void smc_conn_abort(struct smc_sock *smc, int local_first)
{
- bool is_smcd = smc->conn.lgr->is_smcd;
-
- if (local_contact == SMC_FIRST_CONTACT)
+ if (local_first)
smc_lgr_cleanup_early(&smc->conn);
else
smc_conn_free(&smc->conn);
- if (is_smcd)
- /* there is only one lgr role for SMC-D; use server lock */
- mutex_unlock(&smc_server_lgr_pending);
- else
- mutex_unlock(&smc_client_lgr_pending);
-
- smc->connect_nonblock = 0;
- return reason_code;
}
/* check if there is a rdma device available for this connection. */
@@ -561,47 +581,137 @@ static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
{
/* Find ISM device with same PNETID as connecting interface */
smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
- if (!ini->ism_dev)
+ if (!ini->ism_dev[0])
return SMC_CLC_DECL_NOSMCDDEV;
+ else
+ ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
return 0;
}
+/* is chid unique for the ism devices that are already determined? */
+static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
+ int cnt)
+{
+ int i = (!ini->ism_dev[0]) ? 1 : 0;
+
+ for (; i < cnt; i++)
+ if (ini->ism_chid[i] == chid)
+ return false;
+ return true;
+}
+
+/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
+ * PNETID matching net_device)
+ */
+static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ int rc = SMC_CLC_DECL_NOSMCDDEV;
+ struct smcd_dev *smcd;
+ int i = 1;
+ u16 chid;
+
+ if (smcd_indicated(ini->smc_type_v1))
+ rc = 0; /* already initialized for V1 */
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->going_away || smcd == ini->ism_dev[0])
+ continue;
+ chid = smc_ism_get_chid(smcd);
+ if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
+ continue;
+ if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
+ smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
+ ini->ism_dev[i] = smcd;
+ ini->ism_chid[i] = chid;
+ ini->is_smcd = true;
+ rc = 0;
+ i++;
+ if (i > SMC_MAX_ISM_DEVS)
+ break;
+ }
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+ ini->ism_offered_cnt = i - 1;
+ if (!ini->ism_dev[0] && !ini->ism_dev[1])
+ ini->smcd_version = 0;
+
+ return rc;
+}
+
/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
struct smc_init_info *ini)
{
- if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
+ if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
return SMC_CLC_DECL_ISMVLANERR;
return 0;
}
+static int smc_find_proposal_devices(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ int rc = 0;
+
+ /* check if there is an ism device available */
+ if (ini->smcd_version & SMC_V1) {
+ if (smc_find_ism_device(smc, ini) ||
+ smc_connect_ism_vlan_setup(smc, ini)) {
+ if (ini->smc_type_v1 == SMC_TYPE_B)
+ ini->smc_type_v1 = SMC_TYPE_R;
+ else
+ ini->smc_type_v1 = SMC_TYPE_N;
+ } /* else ISM V1 is supported for this connection */
+ if (smc_find_rdma_device(smc, ini)) {
+ if (ini->smc_type_v1 == SMC_TYPE_B)
+ ini->smc_type_v1 = SMC_TYPE_D;
+ else
+ ini->smc_type_v1 = SMC_TYPE_N;
+ } /* else RDMA is supported for this connection */
+ }
+ if (smc_ism_is_v2_capable() && smc_find_ism_v2_device_clnt(smc, ini))
+ ini->smc_type_v2 = SMC_TYPE_N;
+
+ /* if neither ISM nor RDMA are supported, fallback */
+ if (!smcr_indicated(ini->smc_type_v1) &&
+ ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
+ rc = SMC_CLC_DECL_NOSMCDEV;
+
+ return rc;
+}
+
/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
* used, the VLAN ID will be registered again during the connection setup.
*/
-static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
+static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
struct smc_init_info *ini)
{
- if (!is_smcd)
+ if (!smcd_indicated(ini->smc_type_v1))
return 0;
- if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
+ if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
return SMC_CLC_DECL_CNFERR;
return 0;
}
+#define SMC_CLC_MAX_ACCEPT_LEN \
+ (sizeof(struct smc_clc_msg_accept_confirm_v2) + \
+ sizeof(struct smc_clc_first_contact_ext) + \
+ sizeof(struct smc_clc_msg_trail))
+
/* CLC handshake during connect */
-static int smc_connect_clc(struct smc_sock *smc, int smc_type,
- struct smc_clc_msg_accept_confirm *aclc,
+static int smc_connect_clc(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm_v2 *aclc2,
struct smc_init_info *ini)
{
int rc = 0;
/* do inband token exchange */
- rc = smc_clc_send_proposal(smc, smc_type, ini);
+ rc = smc_clc_send_proposal(smc, ini);
if (rc)
return rc;
/* receive SMC Accept CLC message */
- return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
- CLC_WAIT_TIME);
+ return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
+ SMC_CLC_ACCEPT, CLC_WAIT_TIME);
}
/* setup for RDMA connection of client */
@@ -613,9 +723,9 @@ static int smc_connect_rdma(struct smc_sock *smc,
struct smc_link *link;
ini->is_smcd = false;
- ini->ib_lcl = &aclc->lcl;
- ini->ib_clcqpn = ntoh24(aclc->qpn);
- ini->srv_first_contact = aclc->hdr.flag;
+ ini->ib_lcl = &aclc->r0.lcl;
+ ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
mutex_lock(&smc_client_lgr_pending);
reason_code = smc_conn_create(smc, ini);
@@ -626,7 +736,7 @@ static int smc_connect_rdma(struct smc_sock *smc,
smc_conn_save_peer_info(smc, aclc);
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
link = smc->conn.lnk;
} else {
/* set link that was assigned by server */
@@ -634,60 +744,66 @@ static int smc_connect_rdma(struct smc_sock *smc,
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
struct smc_link *l = &smc->conn.lgr->lnk[i];
- if (l->peer_qpn == ntoh24(aclc->qpn) &&
- !memcmp(l->peer_gid, &aclc->lcl.gid, SMC_GID_SIZE) &&
- !memcmp(l->peer_mac, &aclc->lcl.mac, sizeof(l->peer_mac))) {
+ if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
+ !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
+ SMC_GID_SIZE) &&
+ !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
+ sizeof(l->peer_mac))) {
link = l;
break;
}
}
- if (!link)
- return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK,
- ini->cln_first_contact);
+ if (!link) {
+ reason_code = SMC_CLC_DECL_NOSRVLINK;
+ goto connect_abort;
+ }
smc->conn.lnk = link;
}
/* create send buffer and rmb */
- if (smc_buf_create(smc, false))
- return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
- ini->cln_first_contact);
+ if (smc_buf_create(smc, false)) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
- if (ini->cln_first_contact == SMC_FIRST_CONTACT)
+ if (ini->first_contact_local)
smc_link_save_peer_info(link, aclc);
- if (smc_rmb_rtoken_handling(&smc->conn, link, aclc))
- return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
- ini->cln_first_contact);
+ if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
+ reason_code = SMC_CLC_DECL_ERR_RTOK;
+ goto connect_abort;
+ }
smc_close_init(smc);
smc_rx_init(smc);
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
- if (smc_ib_ready_link(link))
- return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
- ini->cln_first_contact);
+ if (ini->first_contact_local) {
+ if (smc_ib_ready_link(link)) {
+ reason_code = SMC_CLC_DECL_ERR_RDYLNK;
+ goto connect_abort;
+ }
} else {
- if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc))
- return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
- ini->cln_first_contact);
+ if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
+ reason_code = SMC_CLC_DECL_ERR_REGRMB;
+ goto connect_abort;
+ }
}
smc_rmb_sync_sg_for_device(&smc->conn);
- reason_code = smc_clc_send_confirm(smc);
+ reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
+ SMC_V1);
if (reason_code)
- return smc_connect_abort(smc, reason_code,
- ini->cln_first_contact);
+ goto connect_abort;
smc_tx_init(smc);
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
/* QP confirmation over RoCE fabric */
smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
reason_code = smcr_clnt_conf_first_link(smc);
smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
if (reason_code)
- return smc_connect_abort(smc, reason_code,
- ini->cln_first_contact);
+ goto connect_abort;
}
mutex_unlock(&smc_client_lgr_pending);
@@ -697,6 +813,31 @@ static int smc_connect_rdma(struct smc_sock *smc,
smc->sk.sk_state = SMC_ACTIVE;
return 0;
+connect_abort:
+ smc_conn_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_client_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return reason_code;
+}
+
+/* The server has chosen one of the proposed ISM devices for the communication.
+ * Determine the chosen ISM device from the CHID of the received CLC ACCEPT.
+ */
+static int
+smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
+ struct smc_init_info *ini)
+{
+ int i;
+
+ for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
+ if (ini->ism_chid[i] == ntohs(aclc->chid)) {
+ ini->ism_selected = i;
+ return 0;
+ }
+ }
+
+ return -EPROTO;
}
/* setup for ISM connection of client */
@@ -707,8 +848,17 @@ static int smc_connect_ism(struct smc_sock *smc,
int rc = 0;
ini->is_smcd = true;
- ini->ism_gid = aclc->gid;
- ini->srv_first_contact = aclc->hdr.flag;
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+
+ if (aclc->hdr.version == SMC_V2) {
+ struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+ rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
+ if (rc)
+ return rc;
+ }
+ ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid;
/* there is only one lgr role for SMC-D; use server lock */
mutex_lock(&smc_server_lgr_pending);
@@ -720,20 +870,20 @@ static int smc_connect_ism(struct smc_sock *smc,
/* Create send and receive buffers */
rc = smc_buf_create(smc, true);
- if (rc)
- return smc_connect_abort(smc, (rc == -ENOSPC) ?
- SMC_CLC_DECL_MAX_DMB :
- SMC_CLC_DECL_MEM,
- ini->cln_first_contact);
+ if (rc) {
+ rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
smc_conn_save_peer_info(smc, aclc);
smc_close_init(smc);
smc_rx_init(smc);
smc_tx_init(smc);
- rc = smc_clc_send_confirm(smc);
+ rc = smc_clc_send_confirm(smc, ini->first_contact_local,
+ aclc->hdr.version);
if (rc)
- return smc_connect_abort(smc, rc, ini->cln_first_contact);
+ goto connect_abort;
mutex_unlock(&smc_server_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
@@ -742,15 +892,40 @@ static int smc_connect_ism(struct smc_sock *smc,
smc->sk.sk_state = SMC_ACTIVE;
return 0;
+connect_abort:
+ smc_conn_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_server_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return rc;
+}
+
+/* check if received accept type and version matches a proposed one */
+static int smc_connect_check_aclc(struct smc_init_info *ini,
+ struct smc_clc_msg_accept_confirm *aclc)
+{
+ if ((aclc->hdr.typev1 == SMC_TYPE_R &&
+ !smcr_indicated(ini->smc_type_v1)) ||
+ (aclc->hdr.typev1 == SMC_TYPE_D &&
+ ((!smcd_indicated(ini->smc_type_v1) &&
+ !smcd_indicated(ini->smc_type_v2)) ||
+ (aclc->hdr.version == SMC_V1 &&
+ !smcd_indicated(ini->smc_type_v1)) ||
+ (aclc->hdr.version == SMC_V2 &&
+ !smcd_indicated(ini->smc_type_v2)))))
+ return SMC_CLC_DECL_MODEUNSUPP;
+
+ return 0;
}
/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
- bool ism_supported = false, rdma_supported = false;
- struct smc_clc_msg_accept_confirm aclc;
- struct smc_init_info ini = {0};
- int smc_type;
+ u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
+ struct smc_clc_msg_accept_confirm_v2 *aclc2;
+ struct smc_clc_msg_accept_confirm *aclc;
+ struct smc_init_info *ini = NULL;
+ u8 *buf = NULL;
int rc = 0;
if (smc->use_fallback)
@@ -760,58 +935,74 @@ static int __smc_connect(struct smc_sock *smc)
if (!tcp_sk(smc->clcsock->sk)->syn_smc)
return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
- /* IPSec connections opt out of SMC-R optimizations */
+ /* IPSec connections opt out of SMC optimizations */
if (using_ipsec(smc))
- return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
+ version);
- /* get vlan id from IP device */
- if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
- return smc_connect_decline_fallback(smc,
- SMC_CLC_DECL_GETVLANERR);
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini)
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
+ version);
- /* check if there is an ism device available */
- if (!smc_find_ism_device(smc, &ini) &&
- !smc_connect_ism_vlan_setup(smc, &ini)) {
- /* ISM is supported for this connection */
- ism_supported = true;
- smc_type = SMC_TYPE_D;
- }
-
- /* check if there is a rdma device available */
- if (!smc_find_rdma_device(smc, &ini)) {
- /* RDMA is supported for this connection */
- rdma_supported = true;
- if (ism_supported)
- smc_type = SMC_TYPE_B; /* both */
- else
- smc_type = SMC_TYPE_R; /* only RDMA */
+ ini->smcd_version = SMC_V1;
+ ini->smcd_version |= smc_ism_is_v2_capable() ? SMC_V2 : 0;
+ ini->smc_type_v1 = SMC_TYPE_B;
+ ini->smc_type_v2 = smc_ism_is_v2_capable() ? SMC_TYPE_D : SMC_TYPE_N;
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
+ ini->smcd_version &= ~SMC_V1;
+ ini->smc_type_v1 = SMC_TYPE_N;
+ if (!ini->smcd_version) {
+ rc = SMC_CLC_DECL_GETVLANERR;
+ goto fallback;
+ }
}
- /* if neither ISM nor RDMA are supported, fallback */
- if (!rdma_supported && !ism_supported)
- return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
+ rc = smc_find_proposal_devices(smc, ini);
+ if (rc)
+ goto fallback;
- /* perform CLC handshake */
- rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
- if (rc) {
- smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
- return smc_connect_decline_fallback(smc, rc);
+ buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto fallback;
}
+ aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
+ aclc = (struct smc_clc_msg_accept_confirm *)aclc2;
+
+ /* perform CLC handshake */
+ rc = smc_connect_clc(smc, aclc2, ini);
+ if (rc)
+ goto vlan_cleanup;
+
+ /* check if smc modes and versions of CLC proposal and accept match */
+ rc = smc_connect_check_aclc(ini, aclc);
+ version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
+ ini->smcd_version = version;
+ if (rc)
+ goto vlan_cleanup;
/* depending on previous steps, connect using rdma or ism */
- if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
- rc = smc_connect_rdma(smc, &aclc, &ini);
- else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
- rc = smc_connect_ism(smc, &aclc, &ini);
- else
- rc = SMC_CLC_DECL_MODEUNSUPP;
- if (rc) {
- smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
- return smc_connect_decline_fallback(smc, rc);
- }
+ if (aclc->hdr.typev1 == SMC_TYPE_R)
+ rc = smc_connect_rdma(smc, aclc, ini);
+ else if (aclc->hdr.typev1 == SMC_TYPE_D)
+ rc = smc_connect_ism(smc, aclc, ini);
+ if (rc)
+ goto vlan_cleanup;
- smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+ kfree(ini);
return 0;
+
+vlan_cleanup:
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+fallback:
+ kfree(ini);
+ return smc_connect_decline_fallback(smc, rc, version);
}
static void smc_connect_work(struct work_struct *work)
@@ -903,7 +1094,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
if (smc->use_fallback)
goto out;
if (flags & O_NONBLOCK) {
- if (schedule_work(&smc->connect_work))
+ if (queue_work(smc_hs_wq, &smc->connect_work))
smc->connect_nonblock = 1;
rc = -EINPROGRESS;
} else {
@@ -940,10 +1131,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
mutex_lock(&lsmc->clcsock_release_lock);
if (lsmc->clcsock)
- rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
mutex_unlock(&lsmc->clcsock_release_lock);
lock_sock(lsk);
- if (rc < 0)
+ if (rc < 0 && rc != -EAGAIN)
lsk->sk_err = -rc;
if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
new_sk->sk_prot->unhash(new_sk);
@@ -956,6 +1147,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
goto out;
}
+ /* new clcsock has inherited the smc listen-specific sk_data_ready
+ * function; switch it back to the original sk_data_ready function
+ */
+ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
(*new_smc)->clcsock = new_clcsock;
out:
return rc;
@@ -1123,13 +1318,10 @@ static void smc_listen_out_err(struct smc_sock *new_smc)
/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
- int local_contact)
+ int local_first, u8 version)
{
/* RDMA setup failed, switch back to TCP */
- if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_cleanup_early(&new_smc->conn);
- else
- smc_conn_free(&new_smc->conn);
+ smc_conn_abort(new_smc, local_first);
if (reason_code < 0) { /* error, no fallback possible */
smc_listen_out_err(new_smc);
return;
@@ -1137,7 +1329,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
smc_switch_to_fallback(new_smc);
new_smc->fallback_rsn = reason_code;
if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
- if (smc_clc_send_decline(new_smc, reason_code) < 0) {
+ if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
smc_listen_out_err(new_smc);
return;
}
@@ -1145,6 +1337,49 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
smc_listen_out_connected(new_smc);
}
+/* listen worker: version checking */
+static int smc_listen_v2_check(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
+ struct smc_clc_v2_extension *pclc_v2_ext;
+ int rc = SMC_CLC_DECL_PEERNOSMC;
+
+ ini->smc_type_v1 = pclc->hdr.typev1;
+ ini->smc_type_v2 = pclc->hdr.typev2;
+ ini->smcd_version = ini->smc_type_v1 != SMC_TYPE_N ? SMC_V1 : 0;
+ if (pclc->hdr.version > SMC_V1)
+ ini->smcd_version |=
+ ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0;
+ if (!(ini->smcd_version & SMC_V2)) {
+ rc = SMC_CLC_DECL_PEERNOSMC;
+ goto out;
+ }
+ if (!smc_ism_is_v2_capable()) {
+ ini->smcd_version &= ~SMC_V2;
+ rc = SMC_CLC_DECL_NOISM2SUPP;
+ goto out;
+ }
+ pclc_v2_ext = smc_get_clc_v2_ext(pclc);
+ if (!pclc_v2_ext) {
+ ini->smcd_version &= ~SMC_V2;
+ rc = SMC_CLC_DECL_NOV2EXT;
+ goto out;
+ }
+ pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
+ if (!pclc_smcd_v2_ext) {
+ ini->smcd_version &= ~SMC_V2;
+ rc = SMC_CLC_DECL_NOV2DEXT;
+ }
+
+out:
+ if (!ini->smcd_version)
+ return rc;
+
+ return 0;
+}
+
/* listen worker: check prefixes */
static int smc_listen_prfx_check(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc)
@@ -1152,6 +1387,8 @@ static int smc_listen_prfx_check(struct smc_sock *new_smc,
struct smc_clc_msg_proposal_prefix *pclc_prfx;
struct socket *newclcsock = new_smc->clcsock;
+ if (pclc->hdr.typev1 == SMC_TYPE_N)
+ return 0;
pclc_prfx = smc_clc_proposal_get_prefix(pclc);
if (smc_clc_prfx_match(newclcsock, pclc_prfx))
return SMC_CLC_DECL_DIFFPREFIX;
@@ -1179,36 +1416,18 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc,
/* listen worker: initialize connection and buffers for SMC-D */
static int smc_listen_ism_init(struct smc_sock *new_smc,
- struct smc_clc_msg_proposal *pclc,
struct smc_init_info *ini)
{
- struct smc_clc_msg_smcd *pclc_smcd;
int rc;
- pclc_smcd = smc_get_clc_msg_smcd(pclc);
- ini->ism_gid = pclc_smcd->gid;
rc = smc_conn_create(new_smc, ini);
if (rc)
return rc;
- /* Check if peer can be reached via ISM device */
- if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
- new_smc->conn.lgr->vlan_id,
- new_smc->conn.lgr->smcd)) {
- if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_lgr_cleanup_early(&new_smc->conn);
- else
- smc_conn_free(&new_smc->conn);
- return SMC_CLC_DECL_SMCDNOTALK;
- }
-
/* Create send and receive buffers */
rc = smc_buf_create(new_smc, true);
if (rc) {
- if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_lgr_cleanup_early(&new_smc->conn);
- else
- smc_conn_free(&new_smc->conn);
+ smc_conn_abort(new_smc, ini->first_contact_local);
return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
SMC_CLC_DECL_MEM;
}
@@ -1216,12 +1435,150 @@ static int smc_listen_ism_init(struct smc_sock *new_smc,
return 0;
}
+static bool smc_is_already_selected(struct smcd_dev *smcd,
+ struct smc_init_info *ini,
+ int matches)
+{
+ int i;
+
+ for (i = 0; i < matches; i++)
+ if (smcd == ini->ism_dev[i])
+ return true;
+
+ return false;
+}
+
+/* check for ISM devices matching proposed ISM devices */
+static void smc_check_ism_v2_match(struct smc_init_info *ini,
+ u16 proposed_chid, u64 proposed_gid,
+ unsigned int *matches)
+{
+ struct smcd_dev *smcd;
+
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->going_away)
+ continue;
+ if (smc_is_already_selected(smcd, ini, *matches))
+ continue;
+ if (smc_ism_get_chid(smcd) == proposed_chid &&
+ !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
+ ini->ism_peer_gid[*matches] = proposed_gid;
+ ini->ism_dev[*matches] = smcd;
+ (*matches)++;
+ break;
+ }
+ }
+}
+
+static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini)
+{
+ if (!ini->rc)
+ ini->rc = rc;
+}
+
+static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_v2_extension *smc_v2_ext;
+ struct smc_clc_msg_smcd *pclc_smcd;
+ unsigned int matches = 0;
+ u8 smcd_version;
+ u8 *eid = NULL;
+ int i, rc;
+
+ if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
+ goto not_found;
+
+ pclc_smcd = smc_get_clc_msg_smcd(pclc);
+ smc_v2_ext = smc_get_clc_v2_ext(pclc);
+ smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
+ if (!smcd_v2_ext ||
+ !smc_v2_ext->hdr.flag.seid) { /* no system EID support for SMCD */
+ smc_find_ism_store_rc(SMC_CLC_DECL_NOSEID, ini);
+ goto not_found;
+ }
+
+ mutex_lock(&smcd_dev_list.mutex);
+ if (pclc_smcd->ism.chid)
+ /* check for ISM device matching proposed native ISM device */
+ smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
+ ntohll(pclc_smcd->ism.gid), &matches);
+ for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) {
+ /* check for ISM devices matching proposed non-native ISM
+ * devices
+ */
+ smc_check_ism_v2_match(ini,
+ ntohs(smcd_v2_ext->gidchid[i - 1].chid),
+ ntohll(smcd_v2_ext->gidchid[i - 1].gid),
+ &matches);
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+
+ if (ini->ism_dev[0]) {
+ smc_ism_get_system_eid(ini->ism_dev[0], &eid);
+ if (memcmp(eid, smcd_v2_ext->system_eid, SMC_MAX_EID_LEN))
+ goto not_found;
+ } else {
+ goto not_found;
+ }
+
+ /* separate - outside the smcd_dev_list.lock */
+ smcd_version = ini->smcd_version;
+ for (i = 0; i < matches; i++) {
+ ini->smcd_version = SMC_V2;
+ ini->is_smcd = true;
+ ini->ism_selected = i;
+ rc = smc_listen_ism_init(new_smc, ini);
+ if (rc) {
+ smc_find_ism_store_rc(rc, ini);
+ /* try next active ISM device */
+ continue;
+ }
+ return; /* matching and usable V2 ISM device found */
+ }
+ /* no V2 ISM device could be initialized */
+ ini->smcd_version = smcd_version; /* restore original value */
+
+not_found:
+ ini->smcd_version &= ~SMC_V2;
+ ini->ism_dev[0] = NULL;
+ ini->is_smcd = false;
+}
+
+static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
+ int rc = 0;
+
+ /* check if ISM V1 is available */
+ if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
+ goto not_found;
+ ini->is_smcd = true; /* prepare ISM check */
+ ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
+ rc = smc_find_ism_device(new_smc, ini);
+ if (rc)
+ goto not_found;
+ ini->ism_selected = 0;
+ rc = smc_listen_ism_init(new_smc, ini);
+ if (!rc)
+ return; /* V1 ISM device found */
+
+not_found:
+ smc_find_ism_store_rc(rc, ini);
+ ini->ism_dev[0] = NULL;
+ ini->is_smcd = false;
+}
+
/* listen worker: register buffers */
-static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
+static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
{
struct smc_connection *conn = &new_smc->conn;
- if (local_contact != SMC_FIRST_CONTACT) {
+ if (!local_first) {
if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
return SMC_CLC_DECL_ERR_REGRMB;
}
@@ -1230,52 +1587,107 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
return 0;
}
+static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ int rc;
+
+ if (!smcr_indicated(ini->smc_type_v1))
+ return SMC_CLC_DECL_NOSMCDEV;
+
+ /* prepare RDMA check */
+ ini->ib_lcl = &pclc->lcl;
+ rc = smc_find_rdma_device(new_smc, ini);
+ if (rc) {
+ /* no RDMA device found */
+ if (ini->smc_type_v1 == SMC_TYPE_B)
+ /* neither ISM nor RDMA device found */
+ rc = SMC_CLC_DECL_NOSMCDEV;
+ return rc;
+ }
+ rc = smc_listen_rdma_init(new_smc, ini);
+ if (rc)
+ return rc;
+ return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
+}
+
+/* determine the local device matching to proposal */
+static int smc_listen_find_device(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ int rc;
+
+ /* check for ISM device matching V2 proposed device */
+ smc_find_ism_v2_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0])
+ return 0;
+
+ if (!(ini->smcd_version & SMC_V1))
+ return ini->rc ?: SMC_CLC_DECL_NOSMCD2DEV;
+
+ /* check for matching IP prefix and subnet length */
+ rc = smc_listen_prfx_check(new_smc, pclc);
+ if (rc)
+ return ini->rc ?: rc;
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
+ return ini->rc ?: SMC_CLC_DECL_GETVLANERR;
+
+ /* check for ISM device matching V1 proposed device */
+ smc_find_ism_v1_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0])
+ return 0;
+
+ if (pclc->hdr.typev1 == SMC_TYPE_D)
+ /* skip RDMA and decline */
+ return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV;
+
+ /* check if RDMA is available */
+ rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
+ smc_find_ism_store_rc(rc, ini);
+
+ return (!rc) ? 0 : ini->rc;
+}
+
/* listen worker: finish RDMA setup */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
struct smc_clc_msg_accept_confirm *cclc,
- int local_contact)
+ bool local_first)
{
struct smc_link *link = new_smc->conn.lnk;
int reason_code = 0;
- if (local_contact == SMC_FIRST_CONTACT)
+ if (local_first)
smc_link_save_peer_info(link, cclc);
- if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) {
- reason_code = SMC_CLC_DECL_ERR_RTOK;
- goto decline;
- }
+ if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
+ return SMC_CLC_DECL_ERR_RTOK;
- if (local_contact == SMC_FIRST_CONTACT) {
- if (smc_ib_ready_link(link)) {
- reason_code = SMC_CLC_DECL_ERR_RDYLNK;
- goto decline;
- }
+ if (local_first) {
+ if (smc_ib_ready_link(link))
+ return SMC_CLC_DECL_ERR_RDYLNK;
/* QP confirmation over RoCE fabric */
smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
reason_code = smcr_serv_conf_first_link(new_smc);
smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
- if (reason_code)
- goto decline;
}
- return 0;
-
-decline:
- smc_listen_decline(new_smc, reason_code, local_contact);
return reason_code;
}
-/* setup for RDMA connection of server */
+/* setup for connection of server */
static void smc_listen_work(struct work_struct *work)
{
struct smc_sock *new_smc = container_of(work, struct smc_sock,
smc_listen_work);
+ u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
struct socket *newclcsock = new_smc->clcsock;
- struct smc_clc_msg_accept_confirm cclc;
+ struct smc_clc_msg_accept_confirm *cclc;
+ struct smc_clc_msg_proposal_area *buf;
struct smc_clc_msg_proposal *pclc;
- struct smc_init_info ini = {0};
- bool ism_supported = false;
- u8 buf[SMC_CLC_MAX_LEN];
+ struct smc_init_info *ini = NULL;
int rc = 0;
if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
@@ -1297,102 +1709,86 @@ static void smc_listen_work(struct work_struct *work)
/* do inband token exchange -
* wait for and receive SMC Proposal CLC message
*/
- pclc = (struct smc_clc_msg_proposal *)&buf;
- rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto out_decl;
+ }
+ pclc = (struct smc_clc_msg_proposal *)buf;
+ rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
if (rc)
goto out_decl;
+ version = pclc->hdr.version == SMC_V1 ? SMC_V1 : version;
- /* IPSec connections opt out of SMC-R optimizations */
+ /* IPSec connections opt out of SMC optimizations */
if (using_ipsec(new_smc)) {
rc = SMC_CLC_DECL_IPSEC;
goto out_decl;
}
- /* check for matching IP prefix and subnet length */
- rc = smc_listen_prfx_check(new_smc, pclc);
- if (rc)
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini) {
+ rc = SMC_CLC_DECL_MEM;
goto out_decl;
+ }
- /* get vlan id from IP device */
- if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
- rc = SMC_CLC_DECL_GETVLANERR;
+ /* initial version checking */
+ rc = smc_listen_v2_check(new_smc, pclc, ini);
+ if (rc)
goto out_decl;
- }
mutex_lock(&smc_server_lgr_pending);
smc_close_init(new_smc);
smc_rx_init(new_smc);
smc_tx_init(new_smc);
- /* check if ISM is available */
- if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
- ini.is_smcd = true; /* prepare ISM check */
- rc = smc_find_ism_device(new_smc, &ini);
- if (!rc)
- rc = smc_listen_ism_init(new_smc, pclc, &ini);
- if (!rc)
- ism_supported = true;
- else if (pclc->hdr.path == SMC_TYPE_D)
- goto out_unlock; /* skip RDMA and decline */
- }
-
- /* check if RDMA is available */
- if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
- /* prepare RDMA check */
- ini.is_smcd = false;
- ini.ism_dev = NULL;
- ini.ib_lcl = &pclc->lcl;
- rc = smc_find_rdma_device(new_smc, &ini);
- if (rc) {
- /* no RDMA device found */
- if (pclc->hdr.path == SMC_TYPE_B)
- /* neither ISM nor RDMA device found */
- rc = SMC_CLC_DECL_NOSMCDEV;
- goto out_unlock;
- }
- rc = smc_listen_rdma_init(new_smc, &ini);
- if (rc)
- goto out_unlock;
- rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
- if (rc)
- goto out_unlock;
- }
+ /* determine ISM or RoCE device used for connection */
+ rc = smc_listen_find_device(new_smc, pclc, ini);
+ if (rc)
+ goto out_unlock;
/* send SMC Accept CLC message */
- rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
+ rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
+ ini->smcd_version == SMC_V2 ? SMC_V2 : SMC_V1);
if (rc)
goto out_unlock;
/* SMC-D does not need this lock any more */
- if (ism_supported)
+ if (ini->is_smcd)
mutex_unlock(&smc_server_lgr_pending);
/* receive SMC Confirm CLC message */
- rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+ memset(buf, 0, sizeof(*buf));
+ cclc = (struct smc_clc_msg_accept_confirm *)buf;
+ rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
SMC_CLC_CONFIRM, CLC_WAIT_TIME);
if (rc) {
- if (!ism_supported)
+ if (!ini->is_smcd)
goto out_unlock;
goto out_decl;
}
/* finish worker */
- if (!ism_supported) {
- rc = smc_listen_rdma_finish(new_smc, &cclc,
- ini.cln_first_contact);
- mutex_unlock(&smc_server_lgr_pending);
+ if (!ini->is_smcd) {
+ rc = smc_listen_rdma_finish(new_smc, cclc,
+ ini->first_contact_local);
if (rc)
- return;
+ goto out_unlock;
+ mutex_unlock(&smc_server_lgr_pending);
}
- smc_conn_save_peer_info(new_smc, &cclc);
+ smc_conn_save_peer_info(new_smc, cclc);
smc_listen_out_connected(new_smc);
- return;
+ goto out_free;
out_unlock:
mutex_unlock(&smc_server_lgr_pending);
out_decl:
- smc_listen_decline(new_smc, rc, ini.cln_first_contact);
+ smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
+ version);
+out_free:
+ kfree(ini);
+ kfree(buf);
}
static void smc_tcp_listen_work(struct work_struct *work)
@@ -1406,7 +1802,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
lock_sock(lsk);
while (lsk->sk_state == SMC_LISTEN) {
rc = smc_clcsock_accept(lsmc, &new_smc);
- if (rc)
+ if (rc) /* clcsock accept queue empty or error */
goto out;
if (!new_smc)
continue;
@@ -1420,13 +1816,29 @@ static void smc_tcp_listen_work(struct work_struct *work)
new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
sock_hold(&new_smc->sk); /* sock_put in passive closing */
- if (!schedule_work(&new_smc->smc_listen_work))
+ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
sock_put(&new_smc->sk);
}
out:
release_sock(lsk);
- sock_put(&lsmc->sk); /* sock_hold in smc_listen */
+ sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
+}
+
+static void smc_clcsock_data_ready(struct sock *listen_clcsock)
+{
+ struct smc_sock *lsmc;
+
+ lsmc = (struct smc_sock *)
+ ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY);
+ if (!lsmc)
+ return;
+ lsmc->clcsk_data_ready(listen_clcsock);
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
+ if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work))
+ sock_put(&lsmc->sk);
+ }
}
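sk_user_data on the listen clcsock doubles as storage for the smc_sock pointer and the SK_USER_DATA_NOCOPY flag, which is why the callback above masks the flag off before dereferencing (the matching store is in smc_listen() further down). A standalone illustration of the tag-and-mask pattern, with nothing kernel-specific in it:

#include <assert.h>
#include <stdint.h>

#define PTR_FLAG_NOCOPY 0x1UL	/* low bit used as a flag, like SK_USER_DATA_NOCOPY */

int main(void)
{
	static int object;	/* stands in for the smc_sock */
	uintptr_t slot;

	/* store: pointer with the flag folded into an otherwise unused low bit */
	slot = (uintptr_t)&object | PTR_FLAG_NOCOPY;

	/* load: mask the flag off before use, as the data_ready callback does */
	assert((int *)(slot & ~PTR_FLAG_NOCOPY) == &object);
	return 0;
}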
static int smc_listen(struct socket *sock, int backlog)
@@ -1455,15 +1867,19 @@ static int smc_listen(struct socket *sock, int backlog)
if (!smc->use_fallback)
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ /* save original sk_data_ready function and establish
+ * smc-specific sk_data_ready function
+ */
+ smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready;
+ smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
+ smc->clcsock->sk->sk_user_data =
+ (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
rc = kernel_listen(smc->clcsock, backlog);
if (rc)
goto out;
sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
sk->sk_state = SMC_LISTEN;
- sock_hold(sk); /* sock_hold in tcp_listen_worker */
- if (!schedule_work(&smc->tcp_listen_work))
- sock_put(sk);
out:
release_sock(sk);
@@ -1788,8 +2204,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
if (val)
- mod_delayed_work(system_wq, &smc->conn.tx_work,
- 0);
+ mod_delayed_work(smc->conn.lgr->tx_wq,
+ &smc->conn.tx_work, 0);
}
break;
case TCP_CORK:
@@ -1797,8 +2213,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
if (!val)
- mod_delayed_work(system_wq, &smc->conn.tx_work,
- 0);
+ mod_delayed_work(smc->conn.lgr->tx_wq,
+ &smc->conn.tx_work, 0);
}
break;
case TCP_DEFER_ACCEPT:
@@ -2077,14 +2493,30 @@ static int __init smc_init(void)
if (rc)
return rc;
- rc = smc_pnet_init();
+ smc_ism_init();
+ smc_clc_init();
+
+ rc = smc_nl_init();
if (rc)
goto out_pernet_subsys;
+ rc = smc_pnet_init();
+ if (rc)
+ goto out_nl;
+
+ rc = -ENOMEM;
+ smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
+ if (!smc_hs_wq)
+ goto out_pnet;
+
+ smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
+ if (!smc_close_wq)
+ goto out_alloc_hs_wq;
+
rc = smc_core_init();
if (rc) {
pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_alloc_wqs;
}
rc = smc_llc_init();
@@ -2136,8 +2568,14 @@ out_proto:
proto_unregister(&smc_proto);
out_core:
smc_core_exit();
+out_alloc_wqs:
+ destroy_workqueue(smc_close_wq);
+out_alloc_hs_wq:
+ destroy_workqueue(smc_hs_wq);
out_pnet:
smc_pnet_exit();
+out_nl:
+ smc_nl_exit();
out_pernet_subsys:
unregister_pernet_subsys(&smc_net_ops);
@@ -2150,9 +2588,12 @@ static void __exit smc_exit(void)
sock_unregister(PF_SMC);
smc_core_exit();
smc_ib_unregister_client();
+ destroy_workqueue(smc_close_wq);
+ destroy_workqueue(smc_hs_wq);
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
smc_pnet_exit();
+ smc_nl_exit();
unregister_pernet_subsys(&smc_net_ops);
rcu_barrier();
}
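Note: the af_smc.c hunks above move handshake and close processing off the system workqueue onto the new smc_hs_wq/smc_close_wq queues and wrap the listening clcsock's ->sk_data_ready, tagging ->sk_user_data with SK_USER_DATA_NOCOPY so sockets cloned from the listener do not inherit the pointer. A minimal stand-alone sketch of that callback-wrapping pattern follows; the my_* names and the context structure are hypothetical, only the sk_user_data tagging and queue_work() usage mirror the patch.

#include <linux/workqueue.h>
#include <net/sock.h>

struct my_listen_ctx {				/* hypothetical container */
	void (*saved_data_ready)(struct sock *sk);
	struct work_struct work;
};

static struct workqueue_struct *my_wq;		/* hypothetical workqueue */

static void my_data_ready(struct sock *sk)
{
	struct my_listen_ctx *ctx;

	/* strip the NOCOPY tag before dereferencing sk_user_data */
	ctx = (struct my_listen_ctx *)
	      ((uintptr_t)sk->sk_user_data & ~SK_USER_DATA_NOCOPY);
	if (!ctx)
		return;
	ctx->saved_data_ready(sk);	/* keep the original TCP wakeup */
	queue_work(my_wq, &ctx->work);	/* defer protocol work off softirq */
}

static void my_install(struct socket *clcsock, struct my_listen_ctx *ctx)
{
	ctx->saved_data_ready = clcsock->sk->sk_data_ready;
	clcsock->sk->sk_user_data =
		(void *)((uintptr_t)ctx | SK_USER_DATA_NOCOPY);
	clcsock->sk->sk_data_ready = my_data_ready;
}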
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 6f1c42da7a4c..d65e15f0c944 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -18,9 +18,20 @@
#include "smc_ib.h"
+#define SMC_V1 1 /* SMC version V1 */
+#define SMC_V2 2 /* SMC version V2 */
+#define SMC_RELEASE 0
+
#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */
#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */
+#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM
+ * devices
+ */
+
+#define SMC_MAX_HOSTNAME_LEN 32
+#define SMC_MAX_EID_LEN 32
+
extern struct proto smc_proto;
extern struct proto smc_proto6;
@@ -201,6 +212,8 @@ struct smc_connection {
struct smc_sock { /* smc sock container */
struct sock sk;
struct socket *clcsock; /* internal tcp socket */
+ void (*clcsk_data_ready)(struct sock *sk);
+ /* original data_ready fct. */
struct smc_connection conn; /* smc connection */
struct smc_sock *listen_smc; /* listen parent */
struct work_struct connect_work; /* handle non-blocking connect*/
@@ -235,10 +248,16 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
return (struct smc_sock *)sk;
}
+extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+extern struct workqueue_struct *smc_close_wq; /* wq for close work */
+
#define SMC_SYSTEMID_LEN 8
extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+#define ntohll(x) be64_to_cpu(x)
+#define htonll(x) cpu_to_be64(x)
+
/* convert an u32 value into network byte order, store it into a 3 byte field */
static inline void hton24(u8 *net, u32 host)
{
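Note: the smc.h hunk above introduces htonll()/ntohll() as wrappers around cpu_to_be64()/be64_to_cpu() and keeps hton24() for 3-byte wire fields. A hedged, self-contained equivalent is sketched below with my_-prefixed names; it illustrates the technique rather than copying the kernel helpers.

#include <linux/types.h>
#include <linux/string.h>
#include <asm/byteorder.h>

#define my_htonll(x)	cpu_to_be64(x)	/* host u64 -> network byte order */
#define my_ntohll(x)	be64_to_cpu(x)	/* network u64 -> host byte order */

/* store the low 24 bits of a host-order value into a 3-byte wire field */
static inline void my_hton24(u8 *net, u32 host)
{
	__be32 t = cpu_to_be32(host);

	memcpy(net, (u8 *)&t + 1, 3);	/* drop the most significant byte */
}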
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index ce468ff62a19..f23f558054a7 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -299,7 +299,7 @@ static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc,
conn->lnk = link;
spin_unlock_bh(&conn->send_lock);
sock_hold(&smc->sk); /* sock_put in abort_work */
- if (!schedule_work(&conn->abort_work))
+ if (!queue_work(smc_close_wq, &conn->abort_work))
sock_put(&smc->sk);
}
}
@@ -368,7 +368,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(&smc->sk, SOCK_DONE);
sock_hold(&smc->sk); /* sock_put in close_work */
- if (!schedule_work(&conn->close_work))
+ if (!queue_work(smc_close_wq, &conn->close_work))
sock_put(&smc->sk);
}
}
@@ -389,9 +389,9 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
* Context:
* - tasklet context
*/
-static void smcd_cdc_rx_tsklet(unsigned long data)
+static void smcd_cdc_rx_tsklet(struct tasklet_struct *t)
{
- struct smc_connection *conn = (struct smc_connection *)data;
+ struct smc_connection *conn = from_tasklet(conn, t, rx_tsklet);
struct smcd_cdc_msg *data_cdc;
struct smcd_cdc_msg cdc;
struct smc_sock *smc;
@@ -411,7 +411,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data)
*/
void smcd_cdc_rx_init(struct smc_connection *conn)
{
- tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn);
+ tasklet_setup(&conn->rx_tsklet, smcd_cdc_rx_tsklet);
}
/***************************** init, exit, misc ******************************/
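Note: the smc_cdc.c hunk converts the SMC-D receive tasklet to the tasklet_setup()/from_tasklet() API, so the callback recovers its smc_connection from the embedded tasklet_struct instead of an unsigned long cookie. A small illustrative example of the same conversion, using hypothetical my_* types:

#include <linux/interrupt.h>

struct my_conn {				/* hypothetical container */
	struct tasklet_struct rx_tsklet;
	int rx_pending;
};

static void my_rx_tsklet(struct tasklet_struct *t)
{
	/* recover the container from the tasklet_struct member */
	struct my_conn *conn = from_tasklet(conn, t, rx_tsklet);

	conn->rx_pending = 0;			/* handle deferred rx here */
}

static void my_rx_init(struct my_conn *conn)
{
	tasklet_setup(&conn->rx_tsklet, my_rx_tsklet);
}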
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 779f4142a11d..e286dafd6e88 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -14,6 +14,8 @@
#include <linux/inetdevice.h>
#include <linux/if_ether.h>
#include <linux/sched/signal.h>
+#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <net/addrconf.h>
#include <net/sock.h>
@@ -27,6 +29,7 @@
#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68
#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48
+#define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78
#define SMC_CLC_RECV_BUF_LEN 100
/* eye catcher "SMCR" EBCDIC for CLC messages */
@@ -34,13 +37,88 @@ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
/* eye catcher "SMCD" EBCDIC for CLC messages */
static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'};
+static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN];
+
+/* check arriving CLC proposal */
+static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc)
+{
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_msg_hdr *hdr = &pclc->hdr;
+ struct smc_clc_v2_extension *v2_ext;
+
+ v2_ext = smc_get_clc_v2_ext(pclc);
+ pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+ if (hdr->version == SMC_V1) {
+ if (hdr->typev1 == SMC_TYPE_N)
+ return false;
+ if (ntohs(hdr->length) !=
+ sizeof(*pclc) + ntohs(pclc->iparea_offset) +
+ sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(struct smc_clc_ipv6_prefix) +
+ sizeof(struct smc_clc_msg_trail))
+ return false;
+ } else {
+ if (ntohs(hdr->length) !=
+ sizeof(*pclc) +
+ sizeof(struct smc_clc_msg_smcd) +
+ (hdr->typev1 != SMC_TYPE_N ?
+ sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(struct smc_clc_ipv6_prefix) : 0) +
+ (hdr->typev2 != SMC_TYPE_N ?
+ sizeof(*v2_ext) +
+ v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN : 0) +
+ (smcd_indicated(hdr->typev2) ?
+ sizeof(*smcd_v2_ext) + v2_ext->hdr.ism_gid_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid) :
+ 0) +
+ sizeof(struct smc_clc_msg_trail))
+ return false;
+ }
+ return true;
+}
+
+/* check arriving CLC accept or confirm */
+static bool
+smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2)
+{
+ struct smc_clc_msg_hdr *hdr = &clc_v2->hdr;
+
+ if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D)
+ return false;
+ if (hdr->version == SMC_V1) {
+ if ((hdr->typev1 == SMC_TYPE_R &&
+ ntohs(hdr->length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) ||
+ (hdr->typev1 == SMC_TYPE_D &&
+ ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN))
+ return false;
+ } else {
+ if (hdr->typev1 == SMC_TYPE_D &&
+ ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 &&
+ (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 +
+ sizeof(struct smc_clc_first_contact_ext)))
+ return false;
+ }
+ return true;
+}
+
+static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len)
+{
+ memset(fce, 0, sizeof(*fce));
+ fce->os_type = SMC_CLC_OS_LINUX;
+ fce->release = SMC_RELEASE;
+ memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname));
+ (*len) += sizeof(*fce);
+}
+
/* check if received message has a correct header length and contains valid
* heading and trailing eyecatchers
*/
static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl)
{
- struct smc_clc_msg_proposal_prefix *pclc_prfx;
- struct smc_clc_msg_accept_confirm *clc;
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2;
struct smc_clc_msg_proposal *pclc;
struct smc_clc_msg_decline *dclc;
struct smc_clc_msg_trail *trl;
@@ -51,29 +129,19 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl)
switch (clcm->type) {
case SMC_CLC_PROPOSAL:
pclc = (struct smc_clc_msg_proposal *)clcm;
- pclc_prfx = smc_clc_proposal_get_prefix(pclc);
- if (ntohs(pclc->hdr.length) <
- sizeof(*pclc) + ntohs(pclc->iparea_offset) +
- sizeof(*pclc_prfx) +
- pclc_prfx->ipv6_prefixes_cnt *
- sizeof(struct smc_clc_ipv6_prefix) +
- sizeof(*trl))
+ if (!smc_clc_msg_prop_valid(pclc))
return false;
trl = (struct smc_clc_msg_trail *)
((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl));
break;
case SMC_CLC_ACCEPT:
case SMC_CLC_CONFIRM:
- if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D)
- return false;
- clc = (struct smc_clc_msg_accept_confirm *)clcm;
- if ((clcm->path == SMC_TYPE_R &&
- ntohs(clc->hdr.length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) ||
- (clcm->path == SMC_TYPE_D &&
- ntohs(clc->hdr.length) != SMCD_CLC_ACCEPT_CONFIRM_LEN))
+ clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm;
+ if (!smc_clc_msg_acc_conf_valid(clc_v2))
return false;
trl = (struct smc_clc_msg_trail *)
- ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl));
+ ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) -
+ sizeof(*trl));
break;
case SMC_CLC_DECLINE:
dclc = (struct smc_clc_msg_decline *)clcm;
@@ -153,7 +221,6 @@ static int smc_clc_prfx_set(struct socket *clcsock,
struct sockaddr_in *addr;
int rc = -ENOENT;
- memset(prop, 0, sizeof(*prop));
if (!dst) {
rc = -ENOTCONN;
goto out;
@@ -320,7 +387,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
}
datlen = ntohs(clcm->length);
if ((len < sizeof(struct smc_clc_msg_hdr)) ||
- (clcm->version < SMC_CLC_V1) ||
+ (clcm->version < SMC_V1) ||
((clcm->type != SMC_CLC_DECLINE) &&
(clcm->type != expected_type))) {
smc->sk.sk_err = EPROTO;
@@ -328,9 +395,6 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
goto out;
}
- if (clcm->type == SMC_CLC_PROPOSAL && clcm->path == SMC_TYPE_N)
- reason_code = SMC_CLC_DECL_VERSMISMAT; /* just V2 offered */
-
/* receive the complete CLC message */
memset(&msg, 0, sizeof(struct msghdr));
if (datlen > buflen) {
@@ -366,7 +430,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
dclc = (struct smc_clc_msg_decline *)clcm;
reason_code = SMC_CLC_DECL_PEERDECL;
smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
- if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
+ if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 &
+ SMC_FIRST_CONTACT_MASK) {
smc->conn.lgr->sync_err = 1;
smc_lgr_terminate_sched(smc->conn.lgr);
}
@@ -378,7 +443,7 @@ out:
}
/* send CLC DECLINE message across internal TCP socket */
-int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version)
{
struct smc_clc_msg_decline dclc;
struct msghdr msg;
@@ -389,8 +454,10 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
dclc.hdr.type = SMC_CLC_DECLINE;
dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
- dclc.hdr.version = SMC_CLC_V1;
- dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0;
+ dclc.hdr.version = version;
+ dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX;
+ dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ?
+ SMC_FIRST_CONTACT_MASK : 0;
if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) &&
smc_ib_is_valid_local_systemid())
memcpy(dclc.id_for_peer, local_systemid,
@@ -409,142 +476,274 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
}
/* send CLC PROPOSAL message across internal TCP socket */
-int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
- struct smc_init_info *ini)
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
{
- struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
- struct smc_clc_msg_proposal_prefix pclc_prfx;
- struct smc_clc_msg_smcd pclc_smcd;
- struct smc_clc_msg_proposal pclc;
- struct smc_clc_msg_trail trl;
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct smc_clc_msg_proposal *pclc_base;
+ struct smc_clc_smcd_gid_chid *gidchids;
+ struct smc_clc_msg_proposal_area *pclc;
+ struct smc_clc_ipv6_prefix *ipv6_prfx;
+ struct smc_clc_v2_extension *v2_ext;
+ struct smc_clc_msg_smcd *pclc_smcd;
+ struct smc_clc_msg_trail *trl;
int len, i, plen, rc;
int reason_code = 0;
- struct kvec vec[5];
+ struct kvec vec[8];
struct msghdr msg;
+ pclc = kzalloc(sizeof(*pclc), GFP_KERNEL);
+ if (!pclc)
+ return -ENOMEM;
+
+ pclc_base = &pclc->pclc_base;
+ pclc_smcd = &pclc->pclc_smcd;
+ pclc_prfx = &pclc->pclc_prfx;
+ ipv6_prfx = pclc->pclc_prfx_ipv6;
+ v2_ext = &pclc->pclc_v2_ext;
+ smcd_v2_ext = &pclc->pclc_smcd_v2_ext;
+ gidchids = pclc->pclc_gidchids;
+ trl = &pclc->pclc_trl;
+
+ pclc_base->hdr.version = SMC_V2;
+ pclc_base->hdr.typev1 = ini->smc_type_v1;
+ pclc_base->hdr.typev2 = ini->smc_type_v2;
+ plen = sizeof(*pclc_base) + sizeof(*pclc_smcd) + sizeof(*trl);
+
/* retrieve ip prefixes for CLC proposal msg */
- rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx);
- if (rc)
- return SMC_CLC_DECL_CNFERR; /* configuration error */
+ if (ini->smc_type_v1 != SMC_TYPE_N) {
+ rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx);
+ if (rc) {
+ if (ini->smc_type_v2 == SMC_TYPE_N) {
+ kfree(pclc);
+ return SMC_CLC_DECL_CNFERR;
+ }
+ pclc_base->hdr.typev1 = SMC_TYPE_N;
+ } else {
+ pclc_base->iparea_offset = htons(sizeof(*pclc_smcd));
+ plen += sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ }
+ }
- /* send SMC Proposal CLC message */
- plen = sizeof(pclc) + sizeof(pclc_prfx) +
- (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) +
- sizeof(trl);
- memset(&pclc, 0, sizeof(pclc));
- memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
- pclc.hdr.type = SMC_CLC_PROPOSAL;
- pclc.hdr.version = SMC_CLC_V1; /* SMC version */
- pclc.hdr.path = smc_type;
- if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) {
+ /* build SMC Proposal CLC message */
+ memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ pclc_base->hdr.type = SMC_CLC_PROPOSAL;
+ if (smcr_indicated(ini->smc_type_v1)) {
/* add SMC-R specifics */
- memcpy(pclc.lcl.id_for_peer, local_systemid,
+ memcpy(pclc_base->lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE);
- memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
+ memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE);
+ memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
ETH_ALEN);
- pclc.iparea_offset = htons(0);
}
- if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
+ if (smcd_indicated(ini->smc_type_v1)) {
/* add SMC-D specifics */
- memset(&pclc_smcd, 0, sizeof(pclc_smcd));
- plen += sizeof(pclc_smcd);
- pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET);
- pclc_smcd.gid = ini->ism_dev->local_gid;
+ if (ini->ism_dev[0]) {
+ pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid);
+ pclc_smcd->ism.chid =
+ htons(smc_ism_get_chid(ini->ism_dev[0]));
+ }
}
- pclc.hdr.length = htons(plen);
+ if (ini->smc_type_v2 == SMC_TYPE_N) {
+ pclc_smcd->v2_ext_offset = 0;
+ } else {
+ u16 v2_ext_offset;
+ u8 *eid = NULL;
+
+ v2_ext_offset = sizeof(*pclc_smcd) -
+ offsetofend(struct smc_clc_msg_smcd, v2_ext_offset);
+ if (ini->smc_type_v1 != SMC_TYPE_N)
+ v2_ext_offset += sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ pclc_smcd->v2_ext_offset = htons(v2_ext_offset);
+ v2_ext->hdr.eid_cnt = 0;
+ v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt;
+ v2_ext->hdr.flag.release = SMC_RELEASE;
+ v2_ext->hdr.flag.seid = 1;
+ v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) -
+ offsetofend(struct smc_clnt_opts_area_hdr,
+ smcd_v2_ext_offset) +
+ v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN);
+ if (ini->ism_dev[0])
+ smc_ism_get_system_eid(ini->ism_dev[0], &eid);
+ else
+ smc_ism_get_system_eid(ini->ism_dev[1], &eid);
+ if (eid)
+ memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN);
+ plen += sizeof(*v2_ext) + sizeof(*smcd_v2_ext);
+ if (ini->ism_offered_cnt) {
+ for (i = 1; i <= ini->ism_offered_cnt; i++) {
+ gidchids[i - 1].gid =
+ htonll(ini->ism_dev[i]->local_gid);
+ gidchids[i - 1].chid =
+ htons(smc_ism_get_chid(ini->ism_dev[i]));
+ }
+ plen += ini->ism_offered_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid);
+ }
+ }
+ pclc_base->hdr.length = htons(plen);
+ memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
- memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ /* send SMC Proposal CLC message */
memset(&msg, 0, sizeof(msg));
i = 0;
- vec[i].iov_base = &pclc;
- vec[i++].iov_len = sizeof(pclc);
- if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
- vec[i].iov_base = &pclc_smcd;
- vec[i++].iov_len = sizeof(pclc_smcd);
+ vec[i].iov_base = pclc_base;
+ vec[i++].iov_len = sizeof(*pclc_base);
+ vec[i].iov_base = pclc_smcd;
+ vec[i++].iov_len = sizeof(*pclc_smcd);
+ if (ini->smc_type_v1 != SMC_TYPE_N) {
+ vec[i].iov_base = pclc_prfx;
+ vec[i++].iov_len = sizeof(*pclc_prfx);
+ if (pclc_prfx->ipv6_prefixes_cnt > 0) {
+ vec[i].iov_base = ipv6_prfx;
+ vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ }
}
- vec[i].iov_base = &pclc_prfx;
- vec[i++].iov_len = sizeof(pclc_prfx);
- if (pclc_prfx.ipv6_prefixes_cnt > 0) {
- vec[i].iov_base = &ipv6_prfx[0];
- vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt *
- sizeof(ipv6_prfx[0]);
+ if (ini->smc_type_v2 != SMC_TYPE_N) {
+ vec[i].iov_base = v2_ext;
+ vec[i++].iov_len = sizeof(*v2_ext);
+ vec[i].iov_base = smcd_v2_ext;
+ vec[i++].iov_len = sizeof(*smcd_v2_ext);
+ if (ini->ism_offered_cnt) {
+ vec[i].iov_base = gidchids;
+ vec[i++].iov_len = ini->ism_offered_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid);
+ }
}
- vec[i].iov_base = &trl;
- vec[i++].iov_len = sizeof(trl);
+ vec[i].iov_base = trl;
+ vec[i++].iov_len = sizeof(*trl);
/* due to the few bytes needed for clc-handshake this cannot block */
len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
if (len < 0) {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
reason_code = -smc->sk.sk_err;
- } else if (len < (int)sizeof(pclc)) {
+ } else if (len < ntohs(pclc_base->hdr.length)) {
reason_code = -ENETUNREACH;
smc->sk.sk_err = -reason_code;
}
+ kfree(pclc);
return reason_code;
}
-/* send CLC CONFIRM message across internal TCP socket */
-int smc_clc_send_confirm(struct smc_sock *smc)
+/* build and send CLC CONFIRM / ACCEPT message */
+static int smc_clc_send_confirm_accept(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2,
+ int first_contact, u8 version)
{
struct smc_connection *conn = &smc->conn;
- struct smc_clc_msg_accept_confirm cclc;
- struct smc_link *link;
- int reason_code = 0;
+ struct smc_clc_msg_accept_confirm *clc;
+ struct smc_clc_first_contact_ext fce;
+ struct smc_clc_msg_trail trl;
+ struct kvec vec[3];
struct msghdr msg;
- struct kvec vec;
- int len;
+ int i, len;
/* send SMC Confirm CLC msg */
- memset(&cclc, 0, sizeof(cclc));
- cclc.hdr.type = SMC_CLC_CONFIRM;
- cclc.hdr.version = SMC_CLC_V1; /* SMC version */
- if (smc->conn.lgr->is_smcd) {
+ clc = (struct smc_clc_msg_accept_confirm *)clc_v2;
+ clc->hdr.version = version; /* SMC version */
+ if (first_contact)
+ clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK;
+ if (conn->lgr->is_smcd) {
/* SMC-D specific settings */
- memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER,
+ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER,
sizeof(SMCD_EYECATCHER));
- cclc.hdr.path = SMC_TYPE_D;
- cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
- cclc.gid = conn->lgr->smcd->local_gid;
- cclc.token = conn->rmb_desc->token;
- cclc.dmbe_size = conn->rmbe_size_short;
- cclc.dmbe_idx = 0;
- memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
- memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
+ clc->hdr.typev1 = SMC_TYPE_D;
+ clc->d0.gid = conn->lgr->smcd->local_gid;
+ clc->d0.token = conn->rmb_desc->token;
+ clc->d0.dmbe_size = conn->rmbe_size_short;
+ clc->d0.dmbe_idx = 0;
+ memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+ if (version == SMC_V1) {
+ clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+ } else {
+ u8 *eid = NULL;
+
+ clc_v2->chid = htons(smc_ism_get_chid(conn->lgr->smcd));
+ smc_ism_get_system_eid(conn->lgr->smcd, &eid);
+ if (eid)
+ memcpy(clc_v2->eid, eid, SMC_MAX_EID_LEN);
+ len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2;
+ if (first_contact)
+ smc_clc_fill_fce(&fce, &len);
+ clc_v2->hdr.length = htons(len);
+ }
+ memcpy(trl.eyecatcher, SMCD_EYECATCHER,
sizeof(SMCD_EYECATCHER));
} else {
+ struct smc_link *link = conn->lnk;
+
/* SMC-R specific settings */
link = conn->lnk;
- memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER,
+ memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER,
sizeof(SMC_EYECATCHER));
- cclc.hdr.path = SMC_TYPE_R;
- cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
- memcpy(cclc.lcl.id_for_peer, local_systemid,
+ clc->hdr.typev1 = SMC_TYPE_R;
+ clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(clc->r0.lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE);
- memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+ memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE);
+ memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
ETH_ALEN);
- hton24(cclc.qpn, link->roce_qp->qp_num);
- cclc.rmb_rkey =
+ hton24(clc->r0.qpn, link->roce_qp->qp_num);
+ clc->r0.rmb_rkey =
htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey);
- cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
- cclc.rmbe_alert_token = htonl(conn->alert_token_local);
- cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
- cclc.rmbe_size = conn->rmbe_size_short;
- cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
+ clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local);
+ switch (clc->hdr.type) {
+ case SMC_CLC_ACCEPT:
+ clc->r0.qp_mtu = link->path_mtu;
+ break;
+ case SMC_CLC_CONFIRM:
+ clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
+ break;
+ }
+ clc->r0.rmbe_size = conn->rmbe_size_short;
+ clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
(conn->rmb_desc->sgt[link->link_idx].sgl));
- hton24(cclc.psn, link->psn_initial);
- memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
- sizeof(SMC_EYECATCHER));
+ hton24(clc->r0.psn, link->psn_initial);
+ memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
}
memset(&msg, 0, sizeof(msg));
- vec.iov_base = &cclc;
- vec.iov_len = ntohs(cclc.hdr.length);
- len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
- ntohs(cclc.hdr.length));
- if (len < ntohs(cclc.hdr.length)) {
+ i = 0;
+ vec[i].iov_base = clc_v2;
+ if (version > SMC_V1)
+ vec[i++].iov_len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 - sizeof(trl);
+ else
+ vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ?
+ SMCD_CLC_ACCEPT_CONFIRM_LEN :
+ SMCR_CLC_ACCEPT_CONFIRM_LEN) -
+ sizeof(trl);
+ if (version > SMC_V1 && first_contact) {
+ vec[i].iov_base = &fce;
+ vec[i++].iov_len = sizeof(fce);
+ }
+ vec[i].iov_base = &trl;
+ vec[i++].iov_len = sizeof(trl);
+ return kernel_sendmsg(smc->clcsock, &msg, vec, i,
+ ntohs(clc->hdr.length));
+}
+
+/* send CLC CONFIRM message across internal TCP socket */
+int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
+ u8 version)
+{
+ struct smc_clc_msg_accept_confirm_v2 cclc_v2;
+ int reason_code = 0;
+ int len;
+
+ /* send SMC Confirm CLC msg */
+ memset(&cclc_v2, 0, sizeof(cclc_v2));
+ cclc_v2.hdr.type = SMC_CLC_CONFIRM;
+ len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact,
+ version);
+ if (len < ntohs(cclc_v2.hdr.length)) {
if (len >= 0) {
reason_code = -ENETUNREACH;
smc->sk.sk_err = -reason_code;
@@ -557,67 +756,33 @@ int smc_clc_send_confirm(struct smc_sock *smc)
}
/* send CLC ACCEPT message across internal TCP socket */
-int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
+int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact,
+ u8 version)
{
- struct smc_connection *conn = &new_smc->conn;
- struct smc_clc_msg_accept_confirm aclc;
- struct smc_link *link;
- struct msghdr msg;
- struct kvec vec;
+ struct smc_clc_msg_accept_confirm_v2 aclc_v2;
int len;
- memset(&aclc, 0, sizeof(aclc));
- aclc.hdr.type = SMC_CLC_ACCEPT;
- aclc.hdr.version = SMC_CLC_V1; /* SMC version */
- if (srv_first_contact)
- aclc.hdr.flag = 1;
-
- if (new_smc->conn.lgr->is_smcd) {
- /* SMC-D specific settings */
- aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
- memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER,
- sizeof(SMCD_EYECATCHER));
- aclc.hdr.path = SMC_TYPE_D;
- aclc.gid = conn->lgr->smcd->local_gid;
- aclc.token = conn->rmb_desc->token;
- aclc.dmbe_size = conn->rmbe_size_short;
- aclc.dmbe_idx = 0;
- memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
- memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
- sizeof(SMCD_EYECATCHER));
- } else {
- /* SMC-R specific settings */
- aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
- memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER,
- sizeof(SMC_EYECATCHER));
- aclc.hdr.path = SMC_TYPE_R;
- link = conn->lnk;
- memcpy(aclc.lcl.id_for_peer, local_systemid,
- sizeof(local_systemid));
- memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE);
- memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
- ETH_ALEN);
- hton24(aclc.qpn, link->roce_qp->qp_num);
- aclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey);
- aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */
- aclc.rmbe_alert_token = htonl(conn->alert_token_local);
- aclc.qp_mtu = link->path_mtu;
- aclc.rmbe_size = conn->rmbe_size_short,
- aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
- (conn->rmb_desc->sgt[link->link_idx].sgl));
- hton24(aclc.psn, link->psn_initial);
- memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
- sizeof(SMC_EYECATCHER));
- }
-
- memset(&msg, 0, sizeof(msg));
- vec.iov_base = &aclc;
- vec.iov_len = ntohs(aclc.hdr.length);
- len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1,
- ntohs(aclc.hdr.length));
- if (len < ntohs(aclc.hdr.length))
+ memset(&aclc_v2, 0, sizeof(aclc_v2));
+ aclc_v2.hdr.type = SMC_CLC_ACCEPT;
+ len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact,
+ version);
+ if (len < ntohs(aclc_v2.hdr.length))
len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err;
return len > 0 ? 0 : len;
}
+
+void smc_clc_get_hostname(u8 **host)
+{
+ *host = &smc_hostname[0];
+}
+
+void __init smc_clc_init(void)
+{
+ struct new_utsname *u;
+
+ memset(smc_hostname, _S, sizeof(smc_hostname)); /* ASCII blanks */
+ u = utsname();
+ memcpy(smc_hostname, u->nodename,
+ min_t(size_t, strlen(u->nodename), sizeof(smc_hostname)));
+}
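Note: smc_clc_send_proposal() above now builds the whole proposal in a kzalloc'd struct smc_clc_msg_proposal_area and emits only the areas that apply (v1 prefixes, v2 extension, SMC-D GID/CHID list) as separate kvec segments of a single kernel_sendmsg() call. The fragment below is a reduced, hypothetical illustration of that gather-send pattern with just two parts:

#include <linux/net.h>
#include <linux/uio.h>

/* send two separately built message parts as one wire message */
static int my_send_two_parts(struct socket *sock, void *hdr, size_t hdr_len,
			     void *trl, size_t trl_len)
{
	struct msghdr msg = {};
	struct kvec vec[2];

	vec[0].iov_base = hdr;
	vec[0].iov_len  = hdr_len;
	vec[1].iov_base = trl;
	vec[1].iov_len  = trl_len;
	return kernel_sendmsg(sock, &msg, vec, 2, hdr_len + trl_len);
}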
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index cf7b45306f4e..32d37f7b70f2 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -22,7 +22,6 @@
#define SMC_CLC_CONFIRM 0x03
#define SMC_CLC_DECLINE 0x04
-#define SMC_CLC_V1 0x1 /* SMC version */
#define SMC_TYPE_R 0 /* SMC-R only */
#define SMC_TYPE_D 1 /* SMC-D only */
#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */
@@ -38,7 +37,11 @@
#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */
#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */
#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */
-#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */
+#define SMC_CLC_DECL_NOISM2SUPP 0x03030003 /* hardware has no ISMv2 support */
+#define SMC_CLC_DECL_NOV2EXT 0x03030004 /* peer sent no clc v2 extension */
+#define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */
+#define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */
+#define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */
#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
@@ -56,19 +59,19 @@
#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */
#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */
+#define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */
+
struct smc_clc_msg_hdr { /* header1 of clc messages */
u8 eyecatcher[4]; /* eye catcher */
u8 type; /* proposal / accept / confirm / decline */
__be16 length;
#if defined(__BIG_ENDIAN_BITFIELD)
u8 version : 4,
- flag : 1,
- rsvd : 1,
- path : 2;
+ typev2 : 2,
+ typev1 : 2;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 path : 2,
- rsvd : 1,
- flag : 1,
+ u8 typev1 : 2,
+ typev2 : 2,
version : 4;
#endif
} __packed; /* format defined in RFC7609 */
@@ -83,8 +86,6 @@ struct smc_clc_msg_local { /* header2 of clc messages */
u8 mac[6]; /* mac of ib_device port */
};
-#define SMC_CLC_MAX_V6_PREFIX 8
-
/* Struct would be 4 byte aligned, but it is used in an array that is sent
* to peers and must conform to RFC7609, hence we need to use packed here.
*/
@@ -93,6 +94,44 @@ struct smc_clc_ipv6_prefix {
u8 prefix_len;
} __packed; /* format defined in RFC7609 */
+#if defined(__BIG_ENDIAN_BITFIELD)
+struct smc_clc_v2_flag {
+ u8 release : 4,
+ rsvd : 3,
+ seid : 1;
+};
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+struct smc_clc_v2_flag {
+ u8 seid : 1,
+ rsvd : 3,
+ release : 4;
+};
+#endif
+
+struct smc_clnt_opts_area_hdr {
+ u8 eid_cnt; /* number of user defined EIDs */
+ u8 ism_gid_cnt; /* number of ISMv2 GIDs */
+ u8 reserved1;
+ struct smc_clc_v2_flag flag;
+ u8 reserved2[2];
+ __be16 smcd_v2_ext_offset; /* SMC-Dv2 Extension Offset */
+};
+
+struct smc_clc_smcd_gid_chid {
+ __be64 gid; /* ISM GID */
+ __be16 chid; /* ISMv2 CHID */
+} __packed; /* format defined in
+ * IBM Shared Memory Communications Version 2
+ * (https://www.ibm.com/support/pages/node/6326337)
+ */
+
+struct smc_clc_v2_extension {
+ struct smc_clnt_opts_area_hdr hdr;
+ u8 roce[16]; /* RoCEv2 GID */
+ u8 reserved[16];
+ u8 user_eids[][SMC_MAX_EID_LEN];
+};
+
struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
__be32 outgoing_subnet; /* subnet mask */
u8 prefix_len; /* number of significant bits in mask */
@@ -101,8 +140,15 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
} __aligned(4);
struct smc_clc_msg_smcd { /* SMC-D GID information */
- u64 gid; /* ISM GID of requestor */
- u8 res[32];
+ struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */
+ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */
+ u8 reserved[28];
+};
+
+struct smc_clc_smcd_v2_extension {
+ u8 system_eid[SMC_MAX_EID_LEN];
+ u8 reserved[16];
+ struct smc_clc_smcd_gid_chid gidchid[];
};
struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
@@ -111,64 +157,107 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
__be16 iparea_offset; /* offset to IP address information area */
} __aligned(4);
-#define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28
-#define SMC_CLC_PROPOSAL_MAX_PREFIX (SMC_CLC_MAX_V6_PREFIX * \
- sizeof(struct smc_clc_ipv6_prefix))
-#define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \
- SMC_CLC_PROPOSAL_MAX_OFFSET + \
- sizeof(struct smc_clc_msg_proposal_prefix) + \
- SMC_CLC_PROPOSAL_MAX_PREFIX + \
- sizeof(struct smc_clc_msg_trail))
+#define SMC_CLC_MAX_V6_PREFIX 8
-struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
- struct smc_clc_msg_hdr hdr;
- union {
- struct { /* SMC-R */
- struct smc_clc_msg_local lcl;
- u8 qpn[3]; /* QP number */
- __be32 rmb_rkey; /* RMB rkey */
- u8 rmbe_idx; /* Index of RMBE in RMB */
- __be32 rmbe_alert_token;/* unique connection id */
+struct smc_clc_msg_proposal_area {
+ struct smc_clc_msg_proposal pclc_base;
+ struct smc_clc_msg_smcd pclc_smcd;
+ struct smc_clc_msg_proposal_prefix pclc_prfx;
+ struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX];
+ struct smc_clc_v2_extension pclc_v2_ext;
+ struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext;
+ struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS];
+ struct smc_clc_msg_trail pclc_trl;
+};
+
+struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 rmbe_idx; /* Index of RMBE in RMB */
+ __be32 rmbe_alert_token; /* unique connection id */
+ #if defined(__BIG_ENDIAN_BITFIELD)
+ u8 rmbe_size : 4, /* buf size (compressed) */
+ qp_mtu : 4; /* QP mtu */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
+#endif
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* packet sequence number */
+} __packed;
+
+struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */
+ u64 gid; /* Sender GID */
+ u64 token; /* DMB token */
+ u8 dmbe_idx; /* DMBE index */
#if defined(__BIG_ENDIAN_BITFIELD)
- u8 rmbe_size : 4, /* buf size (compressed) */
- qp_mtu : 4; /* QP mtu */
+ u8 dmbe_size : 4, /* buf size (compressed) */
+ reserved3 : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 qp_mtu : 4,
- rmbe_size : 4;
+ u8 reserved3 : 4,
+ dmbe_size : 4;
#endif
- u8 reserved;
- __be64 rmb_dma_addr; /* RMB virtual address */
- u8 reserved2;
- u8 psn[3]; /* packet sequence number */
- struct smc_clc_msg_trail smcr_trl;
- /* eye catcher "SMCR" EBCDIC */
- } __packed;
- struct { /* SMC-D */
- u64 gid; /* Sender GID */
- u64 token; /* DMB token */
- u8 dmbe_idx; /* DMBE index */
+ u16 reserved4;
+ __be32 linkid; /* Link identifier */
+} __packed;
+
+#define SMC_CLC_OS_ZOS 1
+#define SMC_CLC_OS_LINUX 2
+#define SMC_CLC_OS_AIX 3
+
+struct smc_clc_first_contact_ext {
+ u8 reserved1;
#if defined(__BIG_ENDIAN_BITFIELD)
- u8 dmbe_size : 4, /* buf size (compressed) */
- reserved3 : 4;
+ u8 os_type : 4,
+ release : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 reserved3 : 4,
- dmbe_size : 4;
+ u8 release : 4,
+ os_type : 4;
#endif
- u16 reserved4;
- u32 linkid; /* Link identifier */
+ u8 reserved2[2];
+ u8 hostname[SMC_MAX_HOSTNAME_LEN];
+};
+
+struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ union {
+ struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
+ struct { /* SMC-D */
+ struct smcd_clc_msg_accept_confirm_common d0;
u32 reserved5[3];
- struct smc_clc_msg_trail smcd_trl;
- /* eye catcher "SMCD" EBCDIC */
- } __packed;
+ };
};
} __packed; /* format defined in RFC7609 */
+struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ union {
+ struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
+ struct { /* SMC-D */
+ struct smcd_clc_msg_accept_confirm_common d0;
+ __be16 chid;
+ u8 eid[SMC_MAX_EID_LEN];
+ u8 reserved5[8];
+ };
+ };
+};
+
struct smc_clc_msg_decline { /* clc decline message */
struct smc_clc_msg_hdr hdr;
u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
__be32 peer_diagnosis; /* diagnosis information */
- u8 reserved2[4];
- struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 os_type : 4,
+ reserved : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 4,
+ os_type : 4;
+#endif
+ u8 reserved2[3];
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
} __aligned(4);
/* determine start of the prefix area within the proposal message */
@@ -179,16 +268,58 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
}
+static inline bool smcr_indicated(int smc_type)
+{
+ return smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B;
+}
+
+static inline bool smcd_indicated(int smc_type)
+{
+ return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B;
+}
+
/* get SMC-D info from proposal message */
static inline struct smc_clc_msg_smcd *
smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
{
- if (ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
+ if (smcd_indicated(prop->hdr.typev1) &&
+ ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
return NULL;
return (struct smc_clc_msg_smcd *)(prop + 1);
}
+static inline struct smc_clc_v2_extension *
+smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop)
+{
+ struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop);
+
+ if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset))
+ return NULL;
+
+ return (struct smc_clc_v2_extension *)
+ ((u8 *)prop_smcd +
+ offsetof(struct smc_clc_msg_smcd, v2_ext_offset) +
+ sizeof(prop_smcd->v2_ext_offset) +
+ ntohs(prop_smcd->v2_ext_offset));
+}
+
+static inline struct smc_clc_smcd_v2_extension *
+smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext)
+{
+ if (!prop_v2ext)
+ return NULL;
+ if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset))
+ return NULL;
+
+ return (struct smc_clc_smcd_v2_extension *)
+ ((u8 *)prop_v2ext +
+ offsetof(struct smc_clc_v2_extension, hdr) +
+ offsetof(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) +
+ sizeof(prop_v2ext->hdr.smcd_v2_ext_offset) +
+ ntohs(prop_v2ext->hdr.smcd_v2_ext_offset));
+}
+
struct smcd_dev;
struct smc_init_info;
@@ -196,10 +327,13 @@ int smc_clc_prfx_match(struct socket *clcsock,
struct smc_clc_msg_proposal_prefix *prop);
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
u8 expected_type, unsigned long timeout);
-int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
-int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
- struct smc_init_info *ini);
-int smc_clc_send_confirm(struct smc_sock *smc);
-int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version);
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini);
+int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
+ u8 version);
+int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact,
+ u8 version);
+void smc_clc_init(void) __init;
+void smc_clc_get_hostname(u8 **host);
#endif
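Note: the new accessors smc_get_clc_v2_ext() and smc_get_clc_smcd_v2_ext() above interpret each stored offset as the distance from the byte immediately after the offset field, which is why they add offsetof() plus the field size before the offset value. The hypothetical structure below illustrates the same convention in isolation:

#include <linux/types.h>
#include <linux/stddef.h>
#include <asm/byteorder.h>

struct my_msg {				/* hypothetical wire layout */
	u8	fixed[8];		/* fixed part */
	__be16	ext_offset;		/* distance from end of this field */
	u8	reserved[6];
	/* variable-length extension follows somewhere after here */
};

static inline void *my_get_ext(struct my_msg *m)
{
	if (!ntohs(m->ext_offset))
		return NULL;			/* no extension present */
	return (u8 *)m + offsetofend(struct my_msg, ext_offset) +
	       ntohs(m->ext_offset);
}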
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 0e7409e469c0..0f9ffba07d26 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -210,9 +210,9 @@ again:
sk->sk_state = SMC_CLOSED;
sk->sk_state_change(sk); /* wake up accept */
if (smc->clcsock && smc->clcsock->sk) {
+ smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
+ smc->clcsock->sk->sk_user_data = NULL;
rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
- /* wake up kernel_accept of smc_tcp_listen_worker */
- smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
}
smc_close_cleanup_listen(sk);
release_sock(sk);
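Note: the smc_close.c change restores the saved ->sk_data_ready and clears ->sk_user_data before shutting the listen clcsock down, so no further handshake work can be queued for a closing listener. A hedged sketch of this teardown step, matching the hypothetical wrapper shown earlier:

#include <linux/net.h>
#include <net/sock.h>

/* undo the data_ready override before the listen socket goes away */
static void my_listen_teardown(struct socket *clcsock,
			       void (*saved_data_ready)(struct sock *sk))
{
	clcsock->sk->sk_data_ready = saved_data_ready;
	clcsock->sk->sk_user_data = NULL;
	kernel_sock_shutdown(clcsock, SHUT_RDWR);
}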
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index a406627b1d55..0df85a12651e 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -16,6 +16,8 @@
#include <linux/wait.h>
#include <linux/reboot.h>
#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/smc.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
@@ -30,13 +32,13 @@
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"
+#include "smc_netlink.h"
#define SMC_LGR_NUM_INCR 256
#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
-#define SMC_LGR_FREE_DELAY_FAST (8 * HZ)
-static struct smc_lgr_list smc_lgr_list = { /* established link groups */
+struct smc_lgr_list smc_lgr_list = { /* established link groups */
.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
.list = LIST_HEAD_INIT(smc_lgr_list.list),
.num = 0,
@@ -64,13 +66,23 @@ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
return &smc_lgr_list.list;
}
+static void smc_ibdev_cnt_inc(struct smc_link *lnk)
+{
+ atomic_inc(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]);
+}
+
+static void smc_ibdev_cnt_dec(struct smc_link *lnk)
+{
+ atomic_dec(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]);
+}
+
static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
/* client link group creation always follows the server link group
* creation. For client use a somewhat higher removal delay time,
* otherwise there is a risk of out-of-sync link groups.
*/
- if (!lgr->freeing && !lgr->freefast) {
+ if (!lgr->freeing) {
mod_delayed_work(system_wq, &lgr->free_work,
(!lgr->is_smcd && lgr->role == SMC_CLNT) ?
SMC_LGR_FREE_DELAY_CLNT :
@@ -78,15 +90,6 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
}
}
-void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
-{
- if (!lgr->freeing && !lgr->freefast) {
- lgr->freefast = 1;
- mod_delayed_work(system_wq, &lgr->free_work,
- SMC_LGR_FREE_DELAY_FAST);
- }
-}
-
/* Register connection's alert token in our lookup structure.
* To use rbtrees we have to implement our own insert core.
* Requires @conns_lock
@@ -149,6 +152,7 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first)
}
if (!conn->lnk)
return SMC_CLC_DECL_NOACTLINK;
+ atomic_inc(&conn->lnk->conn_cnt);
return 0;
}
@@ -190,6 +194,8 @@ static void __smc_lgr_unregister_conn(struct smc_connection *conn)
struct smc_link_group *lgr = conn->lgr;
rb_erase(&conn->alert_node, &lgr->conns_all);
+ if (conn->lnk)
+ atomic_dec(&conn->lnk->conn_cnt);
lgr->conns_num--;
conn->alert_token_local = 0;
sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
@@ -211,6 +217,367 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
conn->lgr = NULL;
}
+int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ char hostname[SMC_MAX_HOSTNAME_LEN + 1];
+ char smc_seid[SMC_MAX_EID_LEN + 1];
+ struct smcd_dev *smcd_dev;
+ struct nlattr *attrs;
+ u8 *seid = NULL;
+ u8 *host = NULL;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_SYS_INFO);
+ if (!nlh)
+ goto errmsg;
+ if (cb_ctx->pos[0])
+ goto errout;
+ attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO);
+ if (!attrs)
+ goto errout;
+ if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable()))
+ goto errattr;
+ smc_clc_get_hostname(&host);
+ if (host) {
+ memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN);
+ hostname[SMC_MAX_HOSTNAME_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname))
+ goto errattr;
+ }
+ mutex_lock(&smcd_dev_list.mutex);
+ smcd_dev = list_first_entry_or_null(&smcd_dev_list.list,
+ struct smcd_dev, list);
+ if (smcd_dev)
+ smc_ism_get_system_eid(smcd_dev, &seid);
+ mutex_unlock(&smcd_dev_list.mutex);
+ if (seid && smc_ism_is_v2_capable()) {
+ memcpy(smc_seid, seid, SMC_MAX_EID_LEN);
+ smc_seid[SMC_MAX_EID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid))
+ goto errattr;
+ }
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ cb_ctx->pos[0] = 1;
+ return skb->len;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return skb->len;
+}
+
+static int smc_nl_fill_lgr(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_target[SMC_MAX_PNETID_LEN + 1];
+ struct nlattr *attrs;
+
+ attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR);
+ if (!attrs)
+ goto errout;
+
+ if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id)))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id))
+ goto errattr;
+ memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN);
+ smc_target[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target))
+ goto errattr;
+
+ nla_nest_end(skb, attrs);
+ return 0;
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ return -EMSGSIZE;
+}
+
+static int smc_nl_fill_lgr_link(struct smc_link_group *lgr,
+ struct smc_link *link,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_ibname[IB_DEVICE_NAME_MAX];
+ u8 smc_gid_target[41];
+ struct nlattr *attrs;
+ u32 link_uid = 0;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_LINK_SMCR);
+ if (!nlh)
+ goto errmsg;
+
+ attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR);
+ if (!attrs)
+ goto errout;
+
+ if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT,
+ atomic_read(&link->conn_cnt)))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx))
+ goto errattr;
+ snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname);
+ if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname))
+ goto errattr;
+ memcpy(&link_uid, link->link_uid, sizeof(link_uid));
+ if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid))
+ goto errattr;
+ memcpy(&link_uid, link->peer_link_uid, sizeof(link_uid));
+ if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid))
+ goto errattr;
+ memset(smc_gid_target, 0, sizeof(smc_gid_target));
+ smc_gid_be16_convert(smc_gid_target, link->gid);
+ if (nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target))
+ goto errattr;
+ memset(smc_gid_target, 0, sizeof(smc_gid_target));
+ smc_gid_be16_convert(smc_gid_target, link->peer_gid);
+ if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target))
+ goto errattr;
+
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static int smc_nl_handle_lgr(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ bool list_links)
+{
+ void *nlh;
+ int i;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_LGR_SMCR);
+ if (!nlh)
+ goto errmsg;
+ if (smc_nl_fill_lgr(lgr, skb, cb))
+ goto errout;
+
+ genlmsg_end(skb, nlh);
+ if (!list_links)
+ goto out;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_usable(&lgr->lnk[i]))
+ continue;
+ if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb))
+ goto errout;
+ }
+out:
+ return 0;
+
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ bool list_links)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct smc_link_group *lgr;
+ int snum = cb_ctx->pos[0];
+ int num = 0;
+
+ spin_lock_bh(&smc_lgr->lock);
+ list_for_each_entry(lgr, &smc_lgr->list, list) {
+ if (num < snum)
+ goto next;
+ if (smc_nl_handle_lgr(lgr, skb, cb, list_links))
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ spin_unlock_bh(&smc_lgr->lock);
+ cb_ctx->pos[0] = num;
+}
+
+static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_host[SMC_MAX_HOSTNAME_LEN + 1];
+ char smc_pnet[SMC_MAX_PNETID_LEN + 1];
+ char smc_eid[SMC_MAX_EID_LEN + 1];
+ struct nlattr *v2_attrs;
+ struct nlattr *attrs;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_LGR_SMCD);
+ if (!nlh)
+ goto errmsg;
+
+ attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCD);
+ if (!attrs)
+ goto errout;
+
+ if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id)))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, lgr->smcd->local_gid,
+ SMC_NLA_LGR_D_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid,
+ SMC_NLA_LGR_D_PAD))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd)))
+ goto errattr;
+ memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN);
+ smc_pnet[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet))
+ goto errattr;
+
+ v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_V2);
+ if (!v2_attrs)
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version))
+ goto errv2attr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release))
+ goto errv2attr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os))
+ goto errv2attr;
+ memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN);
+ smc_host[SMC_MAX_HOSTNAME_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host))
+ goto errv2attr;
+ memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN);
+ smc_eid[SMC_MAX_EID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid))
+ goto errv2attr;
+
+ nla_nest_end(skb, v2_attrs);
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+
+errv2attr:
+ nla_nest_cancel(skb, v2_attrs);
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct smc_link_group *lgr;
+ int snum = cb_ctx->pos[1];
+ int rc = 0, num = 0;
+
+ spin_lock_bh(&dev->lgr_lock);
+ list_for_each_entry(lgr, &dev->lgr_list, list) {
+ if (!lgr->is_smcd)
+ continue;
+ if (num < snum)
+ goto next;
+ rc = smc_nl_fill_smcd_lgr(lgr, skb, cb);
+ if (rc)
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ spin_unlock_bh(&dev->lgr_lock);
+ cb_ctx->pos[1] = num;
+ return rc;
+}
+
+static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct smcd_dev *smcd_dev;
+ int snum = cb_ctx->pos[0];
+ int rc = 0, num = 0;
+
+ mutex_lock(&dev_list->mutex);
+ list_for_each_entry(smcd_dev, &dev_list->list, list) {
+ if (list_empty(&smcd_dev->lgr_list))
+ continue;
+ if (num < snum)
+ goto next;
+ rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb);
+ if (rc)
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ mutex_unlock(&dev_list->mutex);
+ cb_ctx->pos[0] = num;
+ return rc;
+}
+
+int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ bool list_links = false;
+
+ smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links);
+ return skb->len;
+}
+
+int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ bool list_links = true;
+
+ smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links);
+ return skb->len;
+}
+
+int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb);
+ return skb->len;
+}
+
void smc_lgr_cleanup_early(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
@@ -227,7 +594,7 @@ void smc_lgr_cleanup_early(struct smc_connection *conn)
if (!list_empty(lgr_list))
list_del_init(lgr_list);
spin_unlock_bh(lgr_lock);
- smc_lgr_schedule_free_work_fast(lgr);
+ __smc_lgr_terminate(lgr, true);
}
static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr)
@@ -310,6 +677,15 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr)
return link_id;
}
+static void smcr_copy_dev_info_to_link(struct smc_link *link)
+{
+ struct smc_ib_device *smcibdev = link->smcibdev;
+
+ snprintf(link->ibname, sizeof(link->ibname), "%s",
+ smcibdev->ibdev->name);
+ link->ndev_ifidx = smcibdev->ndev_ifidx[link->ibport - 1];
+}
+
int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
u8 link_idx, struct smc_init_info *ini)
{
@@ -323,7 +699,10 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
lnk->link_idx = link_idx;
lnk->smcibdev = ini->ib_dev;
lnk->ibport = ini->ib_port;
+ smc_ibdev_cnt_inc(lnk);
+ smcr_copy_dev_info_to_link(lnk);
lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
+ atomic_set(&lnk->conn_cnt, 0);
smc_llc_link_set_uid(lnk);
INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
if (!ini->ib_dev->initialized) {
@@ -365,6 +744,7 @@ free_link_mem:
clear_llc_lnk:
smc_llc_link_clear(lnk, false);
out:
+ smc_ibdev_cnt_dec(lnk);
put_device(&ini->ib_dev->ibdev->dev);
memset(lnk, 0, sizeof(struct smc_link));
lnk->state = SMC_LNK_UNUSED;
@@ -385,7 +765,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
int i;
if (ini->is_smcd && ini->vlan_id) {
- if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) {
+ if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected],
+ ini->vlan_id)) {
rc = SMC_CLC_DECL_ISMVLANERR;
goto out;
}
@@ -396,10 +777,15 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
rc = SMC_CLC_DECL_MEM;
goto ism_put_vlan;
}
+ lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0,
+ SMC_LGR_ID_SIZE, &lgr->id);
+ if (!lgr->tx_wq) {
+ rc = -ENOMEM;
+ goto free_lgr;
+ }
lgr->is_smcd = ini->is_smcd;
lgr->sync_err = 0;
lgr->terminating = 0;
- lgr->freefast = 0;
lgr->freeing = 0;
lgr->vlan_id = ini->vlan_id;
mutex_init(&lgr->sndbufs_lock);
@@ -417,13 +803,14 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
lgr->conns_all = RB_ROOT;
if (ini->is_smcd) {
/* SMC-D specific settings */
- get_device(&ini->ism_dev->dev);
- lgr->peer_gid = ini->ism_gid;
- lgr->smcd = ini->ism_dev;
- lgr_list = &ini->ism_dev->lgr_list;
+ get_device(&ini->ism_dev[ini->ism_selected]->dev);
+ lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected];
+ lgr->smcd = ini->ism_dev[ini->ism_selected];
+ lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list;
lgr_lock = &lgr->smcd->lgr_lock;
+ lgr->smc_version = ini->smcd_version;
lgr->peer_shutdown = 0;
- atomic_inc(&ini->ism_dev->lgr_cnt);
+ atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt);
} else {
/* SMC-R specific settings */
lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
@@ -437,7 +824,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
lnk = &lgr->lnk[link_idx];
rc = smcr_link_init(lgr, lnk, link_idx, ini);
if (rc)
- goto free_lgr;
+ goto free_wq;
lgr_list = &smc_lgr_list.list;
lgr_lock = &smc_lgr_list.lock;
atomic_inc(&lgr_cnt);
@@ -448,11 +835,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
spin_unlock_bh(lgr_lock);
return 0;
+free_wq:
+ destroy_workqueue(lgr->tx_wq);
free_lgr:
kfree(lgr);
ism_put_vlan:
if (ini->is_smcd && ini->vlan_id)
- smc_ism_put_vlan(ini->ism_dev, ini->vlan_id);
+ smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id);
out:
if (rc < 0) {
if (rc == -ENOMEM)
@@ -517,7 +906,7 @@ static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend,
smc->sk.sk_state != SMC_CLOSED) {
rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf);
if (!rc) {
- schedule_delayed_work(&conn->tx_work, 0);
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0);
smc->sk.sk_data_ready(&smc->sk);
}
} else {
@@ -527,6 +916,14 @@ static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend,
return rc;
}
+static void smc_switch_link_and_count(struct smc_connection *conn,
+ struct smc_link *to_lnk)
+{
+ atomic_dec(&conn->lnk->conn_cnt);
+ conn->lnk = to_lnk;
+ atomic_inc(&conn->lnk->conn_cnt);
+}
+
struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
struct smc_link *from_lnk, bool is_dev_err)
{
@@ -575,7 +972,7 @@ again:
smc->sk.sk_state == SMC_PEERABORTWAIT ||
smc->sk.sk_state == SMC_PROCESSABORT) {
spin_lock_bh(&conn->send_lock);
- conn->lnk = to_lnk;
+ smc_switch_link_and_count(conn, to_lnk);
spin_unlock_bh(&conn->send_lock);
continue;
}
@@ -589,7 +986,7 @@ again:
}
/* avoid race with smcr_tx_sndbuf_nonempty() */
spin_lock_bh(&conn->send_lock);
- conn->lnk = to_lnk;
+ smc_switch_link_and_count(conn, to_lnk);
rc = smc_switch_cursor(smc, pend, wr_buf);
spin_unlock_bh(&conn->send_lock);
sock_put(&smc->sk);
@@ -738,6 +1135,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
smc_ib_destroy_queue_pair(lnk);
smc_ib_dealloc_protection_domain(lnk);
smc_wr_free_link_mem(lnk);
+ smc_ibdev_cnt_dec(lnk);
put_device(&lnk->smcibdev->ibdev->dev);
smcibdev = lnk->smcibdev;
memset(lnk, 0, sizeof(struct smc_link));
@@ -824,11 +1222,10 @@ static void smc_lgr_free(struct smc_link_group *lgr)
}
smc_lgr_free_bufs(lgr);
+ destroy_workqueue(lgr->tx_wq);
if (lgr->is_smcd) {
- if (!lgr->terminating) {
- smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
- put_device(&lgr->smcd->dev);
- }
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ put_device(&lgr->smcd->dev);
if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
wake_up(&lgr->smcd->lgrs_deleted);
} else {
@@ -889,8 +1286,6 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr)
if (lgr->is_smcd) {
smc_ism_signal_shutdown(lgr);
smcd_unregister_all_dmbs(lgr);
- smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
- put_device(&lgr->smcd->dev);
} else {
u32 rsn = lgr->llc_termination_rsn;
@@ -1294,11 +1689,13 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
spinlock_t *lgr_lock;
int rc = 0;
- lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
- lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
- ini->cln_first_contact = SMC_FIRST_CONTACT;
+ lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list :
+ &smc_lgr_list.list;
+ lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock :
+ &smc_lgr_list.lock;
+ ini->first_contact_local = 1;
role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
- if (role == SMC_CLNT && ini->srv_first_contact)
+ if (role == SMC_CLNT && ini->first_contact_peer)
/* create new link group as well */
goto create;
@@ -1307,14 +1704,16 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
list_for_each_entry(lgr, lgr_list, list) {
write_lock_bh(&lgr->conns_lock);
if ((ini->is_smcd ?
- smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
+ smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected],
+ ini->ism_peer_gid[ini->ism_selected]) :
smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
!lgr->sync_err &&
- lgr->vlan_id == ini->vlan_id &&
+ (ini->smcd_version == SMC_V2 ||
+ lgr->vlan_id == ini->vlan_id) &&
(role == SMC_CLNT || ini->is_smcd ||
lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
/* link group found */
- ini->cln_first_contact = SMC_REUSE_CONTACT;
+ ini->first_contact_local = 0;
conn->lgr = lgr;
rc = smc_lgr_register_conn(conn, false);
write_unlock_bh(&lgr->conns_lock);
@@ -1328,8 +1727,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
if (rc)
return rc;
- if (role == SMC_CLNT && !ini->srv_first_contact &&
- ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (role == SMC_CLNT && !ini->first_contact_peer &&
+ ini->first_contact_local) {
/* Server reuses a link group, but Client wants to start
* a new one
* send out_of_sync decline, reason synchr. error
@@ -1338,7 +1737,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
}
create:
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
rc = smc_lgr_create(smc, ini);
if (rc)
goto out;
@@ -1597,7 +1996,7 @@ out:
return rc;
}
-#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
+#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
bool is_dmb, int bufsize)
@@ -1616,7 +2015,11 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
if (rc) {
kfree(buf_desc);
- return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : ERR_PTR(rc);
+ if (rc == -ENOMEM)
+ return ERR_PTR(-EAGAIN);
+ if (rc == -ENOSPC)
+ return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-EIO);
}
buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
/* CDC header stored in buf. So, pretend it was smaller */
@@ -1892,8 +2295,8 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn,
struct smc_link *lnk,
struct smc_clc_msg_accept_confirm *clc)
{
- conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr,
- clc->rmb_rkey);
+ conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr,
+ clc->r0.rmb_rkey);
if (conn->rtoken_idx < 0)
return conn->rtoken_idx;
return 0;
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 1c4d5439d0ff..e8e448771f85 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -13,7 +13,10 @@
#define _SMC_CORE_H
#include <linux/atomic.h>
+#include <linux/smc.h>
+#include <linux/pci.h>
#include <rdma/ib_verbs.h>
+#include <net/genetlink.h>
#include "smc.h"
#include "smc_ib.h"
@@ -124,11 +127,14 @@ struct smc_link {
u8 link_is_asym; /* is link asymmetric? */
struct smc_link_group *lgr; /* parent link group */
struct work_struct link_down_wrk; /* wrk to bring link down */
+ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */
+ int ndev_ifidx; /* network device ifindex */
enum smc_link_state state; /* state of link */
struct delayed_work llc_testlink_wrk; /* testlink worker */
struct completion llc_testlink_resp; /* wait for rx of testlink */
int llc_testlink_time; /* testlink interval */
+ atomic_t conn_cnt; /* connections on this link */
};
/* For now we just allow one parallel link per link group. The SMC protocol
@@ -137,9 +143,6 @@ struct smc_link {
#define SMC_LINKS_PER_LGR_MAX 3
#define SMC_SINGLE_LINK 0
-#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
-#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
-
/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
struct smc_buf_desc {
struct list_head list;
@@ -228,12 +231,17 @@ struct smc_link_group {
u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
struct delayed_work free_work; /* delayed freeing of an lgr */
struct work_struct terminate_work; /* abnormal lgr termination */
+ struct workqueue_struct *tx_wq; /* wq for conn. tx workers */
u8 sync_err : 1; /* lgr no longer fits to peer */
u8 terminating : 1;/* lgr is terminating */
- u8 freefast : 1; /* free worker scheduled fast */
u8 freeing : 1; /* lgr is being freed */
bool is_smcd; /* SMC-R or SMC-D */
+ u8 smc_version;
+ u8 negotiated_eid[SMC_MAX_EID_LEN];
+ u8 peer_os; /* peer operating system */
+ u8 peer_smc_release;
+ u8 peer_hostname[SMC_MAX_HOSTNAME_LEN];
union {
struct { /* SMC-R */
enum smc_lgr_role role;
@@ -294,9 +302,12 @@ struct smc_clc_msg_local;
struct smc_init_info {
u8 is_smcd;
+ u8 smc_type_v1;
+ u8 smc_type_v2;
+ u8 first_contact_peer;
+ u8 first_contact_local;
unsigned short vlan_id;
- int srv_first_contact;
- int cln_first_contact;
+ u32 rc;
/* SMC-R */
struct smc_clc_msg_local *ib_lcl;
struct smc_ib_device *ib_dev;
@@ -304,8 +315,12 @@ struct smc_init_info {
u8 ib_port;
u32 ib_clcqpn;
/* SMC-D */
- u64 ism_gid;
- struct smcd_dev *ism_dev;
+ u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1];
+ struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1];
+ u16 ism_chid[SMC_MAX_ISM_DEVS + 1];
+ u8 ism_offered_cnt; /* # of ISM devices offered */
+ u8 ism_selected; /* index of selected ISM dev*/
+ u8 smcd_version;
};
/* Find the connection associated with the given alert token in the link group.
@@ -354,6 +369,45 @@ static inline bool smc_link_active(struct smc_link *lnk)
return lnk->state == SMC_LNK_ACTIVE;
}
+static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+{
+ sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+ be16_to_cpu(((__be16 *)gid_raw)[0]),
+ be16_to_cpu(((__be16 *)gid_raw)[1]),
+ be16_to_cpu(((__be16 *)gid_raw)[2]),
+ be16_to_cpu(((__be16 *)gid_raw)[3]),
+ be16_to_cpu(((__be16 *)gid_raw)[4]),
+ be16_to_cpu(((__be16 *)gid_raw)[5]),
+ be16_to_cpu(((__be16 *)gid_raw)[6]),
+ be16_to_cpu(((__be16 *)gid_raw)[7]));
+}
+
+struct smc_pci_dev {
+ __u32 pci_fid;
+ __u16 pci_pchid;
+ __u16 pci_vendor;
+ __u16 pci_device;
+ __u8 pci_id[SMC_PCI_ID_STR_LEN];
+};
+
+static inline void smc_set_pci_values(struct pci_dev *pci_dev,
+ struct smc_pci_dev *smc_dev)
+{
+ smc_dev->pci_vendor = pci_dev->vendor;
+ smc_dev->pci_device = pci_dev->device;
+ snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s",
+ pci_name(pci_dev));
+#if IS_ENABLED(CONFIG_S390)
+ { /* Set s390 specific PCI information */
+ struct zpci_dev *zdev;
+
+ zdev = to_zpci(pci_dev);
+ smc_dev->pci_fid = zdev->fid;
+ smc_dev->pci_pchid = zdev->pchid;
+ }
+#endif
+}
+
struct smc_sock;
struct smc_clc_msg_accept_confirm;
struct smc_clc_msg_local;
@@ -401,6 +455,10 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
struct smc_link *from_lnk, bool is_dev_err);
void smcr_link_down_cond(struct smc_link *lnk);
void smcr_link_down_cond_sched(struct smc_link *lnk);
+int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb);
+int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
+int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb);
+int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
{
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index da9ba6d1679b..c952986a6aca 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -22,17 +22,13 @@
#include "smc.h"
#include "smc_core.h"
-static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+struct smc_diag_dump_ctx {
+ int pos[2];
+};
+
+static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb)
{
- sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
- be16_to_cpu(((__be16 *)gid_raw)[0]),
- be16_to_cpu(((__be16 *)gid_raw)[1]),
- be16_to_cpu(((__be16 *)gid_raw)[2]),
- be16_to_cpu(((__be16 *)gid_raw)[3]),
- be16_to_cpu(((__be16 *)gid_raw)[4]),
- be16_to_cpu(((__be16 *)gid_raw)[5]),
- be16_to_cpu(((__be16 *)gid_raw)[6]),
- be16_to_cpu(((__be16 *)gid_raw)[7]));
+ return (struct smc_diag_dump_ctx *)cb->ctx;
}
static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
@@ -151,17 +147,17 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
!list_empty(&smc->conn.lgr->list)) {
struct smc_diag_lgrinfo linfo = {
.role = smc->conn.lgr->role,
- .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
- .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
+ .lnk[0].ibport = smc->conn.lnk->ibport,
+ .lnk[0].link_id = smc->conn.lnk->link_id,
};
memcpy(linfo.lnk[0].ibname,
smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
- sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
+ sizeof(smc->conn.lnk->smcibdev->ibdev->name));
smc_gid_be16_convert(linfo.lnk[0].gid,
- smc->conn.lgr->lnk[0].gid);
+ smc->conn.lnk->gid);
smc_gid_be16_convert(linfo.lnk[0].peer_gid,
- smc->conn.lgr->lnk[0].peer_gid);
+ smc->conn.lnk->peer_gid);
if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
goto errout;
@@ -193,13 +189,15 @@ errout:
}
static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb, int p_type)
{
+ struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb);
struct net *net = sock_net(skb->sk);
+ int snum = cb_ctx->pos[p_type];
struct nlattr *bc = NULL;
struct hlist_head *head;
+ int rc = 0, num = 0;
struct sock *sk;
- int rc = 0;
read_lock(&prot->h.smc_hash->lock);
head = &prot->h.smc_hash->ht;
@@ -209,13 +207,18 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
sk_for_each(sk, head) {
if (!net_eq(sock_net(sk), net))
continue;
+ if (num < snum)
+ goto next;
rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
- if (rc)
- break;
+ if (rc < 0)
+ goto out;
+next:
+ num++;
}
out:
read_unlock(&prot->h.smc_hash->lock);
+ cb_ctx->pos[p_type] = num;
return rc;
}
@@ -223,10 +226,10 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int rc = 0;
- rc = smc_diag_dump_proto(&smc_proto, skb, cb);
+ rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC);
if (!rc)
- rc = smc_diag_dump_proto(&smc_proto6, skb, cb);
- return rc;
+ smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6);
+ return skb->len;
}
static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 1c314dbdc7fa..7d7ba0320d5a 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -25,6 +25,7 @@
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
+#include "smc_netlink.h"
#define SMC_MAX_CQE 32766 /* max. # of completion queue elements */
@@ -198,9 +199,9 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
rcu_read_lock();
ndev = rdma_read_gid_attr_ndev_rcu(attr);
if (!IS_ERR(ndev) &&
- ((!vlan_id && !is_vlan_dev(attr->ndev)) ||
- (vlan_id && is_vlan_dev(attr->ndev) &&
- vlan_dev_vlan_id(attr->ndev) == vlan_id)) &&
+ ((!vlan_id && !is_vlan_dev(ndev)) ||
+ (vlan_id && is_vlan_dev(ndev) &&
+ vlan_dev_vlan_id(ndev) == vlan_id)) &&
attr->gid_type == IB_GID_TYPE_ROCE) {
rcu_read_unlock();
if (gid)
@@ -326,6 +327,171 @@ int smc_ib_create_protection_domain(struct smc_link *lnk)
return rc;
}
+static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
+ struct smc_ib_device *smcibdev)
+{
+ struct smc_link_group *lgr;
+ bool rc = false;
+ int i;
+
+ spin_lock_bh(&smc_lgr->lock);
+ list_for_each_entry(lgr, &smc_lgr->list, list) {
+ if (lgr->is_smcd)
+ continue;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
+ lgr->lnk[i].smcibdev != smcibdev)
+ continue;
+ if (lgr->type == SMC_LGR_SINGLE ||
+ lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
+ rc = true;
+ goto out;
+ }
+ }
+ }
+out:
+ spin_unlock_bh(&smc_lgr->lock);
+ return rc;
+}
+
+static int smc_nl_handle_dev_port(struct sk_buff *skb,
+ struct ib_device *ibdev,
+ struct smc_ib_device *smcibdev,
+ int port)
+{
+ char smc_pnet[SMC_MAX_PNETID_LEN + 1];
+ struct nlattr *port_attrs;
+ unsigned char port_state;
+ int lnk_count = 0;
+
+ port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
+ if (!port_attrs)
+ goto errout;
+
+ if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
+ smcibdev->pnetid_by_user[port]))
+ goto errattr;
+ memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
+ smc_pnet[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
+ smcibdev->ndev_ifidx[port]))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
+ goto errattr;
+ port_state = smc_ib_port_active(smcibdev, port + 1);
+ if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
+ goto errattr;
+ lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
+ if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
+ goto errattr;
+ nla_nest_end(skb, port_attrs);
+ return 0;
+errattr:
+ nla_nest_cancel(skb, port_attrs);
+errout:
+ return -EMSGSIZE;
+}
+
+static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
+ struct sk_buff *skb)
+{
+ if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid))
+ return false;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid))
+ return false;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor))
+ return false;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device))
+ return false;
+ if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
+ return false;
+ return true;
+}
+
+static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_ibname[IB_DEVICE_NAME_MAX];
+ struct smc_pci_dev smc_pci_dev;
+ struct pci_dev *pci_dev;
+ unsigned char is_crit;
+ struct nlattr *attrs;
+ void *nlh;
+ int i;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_DEV_SMCR);
+ if (!nlh)
+ goto errmsg;
+ attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
+ if (!attrs)
+ goto errout;
+ is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
+ if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
+ goto errattr;
+ if (smcibdev->ibdev->dev.parent) {
+ memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
+ pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
+ smc_set_pci_values(pci_dev, &smc_pci_dev);
+ if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
+ goto errattr;
+ }
+ snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
+ if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
+ goto errattr;
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (!rdma_is_port_valid(smcibdev->ibdev, i))
+ continue;
+ if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
+ smcibdev, i - 1))
+ goto errattr;
+ }
+
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct smc_ib_device *smcibdev;
+ int snum = cb_ctx->pos[0];
+ int num = 0;
+
+ mutex_lock(&dev_list->mutex);
+ list_for_each_entry(smcibdev, &dev_list->list, list) {
+ if (num < snum)
+ goto next;
+ if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ mutex_unlock(&dev_list->mutex);
+ cb_ctx->pos[0] = num;
+}
+
+int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
+ return skb->len;
+}
+
static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
struct smc_link *lnk = (struct smc_link *)priv;
@@ -557,6 +723,49 @@ out:
static struct ib_client smc_ib_client;
+static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
+{
+ struct ib_device *ibdev = smcibdev->ibdev;
+ struct net_device *ndev;
+
+ if (!ibdev->ops.get_netdev)
+ return;
+ ndev = ibdev->ops.get_netdev(ibdev, port + 1);
+ if (ndev) {
+ smcibdev->ndev_ifidx[port] = ndev->ifindex;
+ dev_put(ndev);
+ }
+}
+
+void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
+{
+ struct smc_ib_device *smcibdev;
+ struct ib_device *libdev;
+ struct net_device *lndev;
+ u8 port_cnt;
+ int i;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
+ port_cnt = smcibdev->ibdev->phys_port_cnt;
+ for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
+ libdev = smcibdev->ibdev;
+ if (!libdev->ops.get_netdev)
+ continue;
+ lndev = libdev->ops.get_netdev(libdev, i + 1);
+ if (lndev)
+ dev_put(lndev);
+ if (lndev != ndev)
+ continue;
+ if (event == NETDEV_REGISTER)
+ smcibdev->ndev_ifidx[i] = ndev->ifindex;
+ if (event == NETDEV_UNREGISTER)
+ smcibdev->ndev_ifidx[i] = 0;
+ }
+ }
+ mutex_unlock(&smc_ib_devices.mutex);
+}
+
/* callback function for ib_register_client() */
static int smc_ib_add_dev(struct ib_device *ibdev)
{
@@ -596,6 +805,7 @@ static int smc_ib_add_dev(struct ib_device *ibdev)
if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
smcibdev->pnetid[i]))
smc_pnetid_by_table_ib(smcibdev, i + 1);
+ smc_copy_netdev_ifindex(smcibdev, i);
pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
"%.16s%s\n",
smcibdev->ibdev->name, i + 1,
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 2ce481187dd0..3085f5180da7 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -30,6 +30,7 @@ struct smc_ib_devices { /* list of smc ib devices definition */
};
extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
+extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */
struct smc_ib_device { /* ib-device infos for smc */
struct list_head list;
@@ -53,11 +54,15 @@ struct smc_ib_device { /* ib-device infos for smc */
atomic_t lnk_cnt; /* number of links on ibdev */
wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/
struct mutex mutex; /* protect dev setup+cleanup */
+ atomic_t lnk_cnt_by_port[SMC_MAX_PORTS];
+ /* number of links per port */
+ int ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */
};
struct smc_buf_desc;
struct smc_link;
+void smc_ib_ndev_change(struct net_device *ndev, unsigned long event);
int smc_ib_register_client(void) __init;
void smc_ib_unregister_client(void);
bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
@@ -87,4 +92,5 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk,
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
unsigned short vlan_id, u8 gid[], u8 *sgid_index);
bool smc_ib_is_valid_local_systemid(void);
+int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb);
#endif
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 998c525de785..9c6e95882553 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -15,13 +15,16 @@
#include "smc_core.h"
#include "smc_ism.h"
#include "smc_pnet.h"
+#include "smc_netlink.h"
struct smcd_dev_list smcd_dev_list = {
.list = LIST_HEAD_INIT(smcd_dev_list.list),
.mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex)
};
-/* Test if an ISM communication is possible. */
+static bool smc_ism_v2_capable;
+
+/* Test if an ISM communication is possible - same CPC */
int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd)
{
return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0,
@@ -39,6 +42,22 @@ int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos,
return rc < 0 ? rc : 0;
}
+void smc_ism_get_system_eid(struct smcd_dev *smcd, u8 **eid)
+{
+ smcd->ops->get_system_eid(smcd, eid);
+}
+
+u16 smc_ism_get_chid(struct smcd_dev *smcd)
+{
+ return smcd->ops->get_chid(smcd);
+}
+
+/* HW supports ISM V2 and thus System EID is defined */
+bool smc_ism_is_v2_capable(void)
+{
+ return smc_ism_v2_capable;
+}
+
/* Set a connection using this DMBE. */
void smc_ism_set_conn(struct smc_connection *conn)
{
@@ -189,6 +208,97 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
return rc;
}
+static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_pnet[SMC_MAX_PNETID_LEN + 1];
+ struct smc_pci_dev smc_pci_dev;
+ struct nlattr *port_attrs;
+ struct nlattr *attrs;
+ int use_cnt = 0;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_DEV_SMCD);
+ if (!nlh)
+ goto errmsg;
+ attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD);
+ if (!attrs)
+ goto errout;
+ use_cnt = atomic_read(&smcd->lgr_cnt);
+ if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0))
+ goto errattr;
+ memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
+ smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev);
+ if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid))
+ goto errattr;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid))
+ goto errattr;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor))
+ goto errattr;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device))
+ goto errattr;
+ if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id))
+ goto errattr;
+
+ port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT);
+ if (!port_attrs)
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user))
+ goto errportattr;
+ memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN);
+ smc_pnet[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
+ goto errportattr;
+
+ nla_nest_end(skb, port_attrs);
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+
+errportattr:
+ nla_nest_cancel(skb, port_attrs);
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ nlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ int snum = cb_ctx->pos[0];
+ struct smcd_dev *smcd;
+ int num = 0;
+
+ mutex_lock(&dev_list->mutex);
+ list_for_each_entry(smcd, &dev_list->list, list) {
+ if (num < snum)
+ goto next;
+ if (smc_nl_handle_smcd_dev(smcd, skb, cb))
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ mutex_unlock(&dev_list->mutex);
+ cb_ctx->pos[0] = num;
+}
+
+int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb);
+ return skb->len;
+}
+
struct smc_ism_event_work {
struct work_struct work;
struct smcd_dev *smcd;
@@ -319,7 +429,18 @@ EXPORT_SYMBOL_GPL(smcd_alloc_dev);
int smcd_register_dev(struct smcd_dev *smcd)
{
mutex_lock(&smcd_dev_list.mutex);
- list_add_tail(&smcd->list, &smcd_dev_list.list);
+ if (list_empty(&smcd_dev_list.list)) {
+ u8 *system_eid = NULL;
+
+ smc_ism_get_system_eid(smcd, &system_eid);
+ if (system_eid[24] != '0' || system_eid[28] != '0')
+ smc_ism_v2_capable = true;
+ }
+ /* sort list: devices without pnetid before devices with pnetid */
+ if (smcd->pnetid[0])
+ list_add_tail(&smcd->list, &smcd_dev_list.list);
+ else
+ list_add(&smcd->list, &smcd_dev_list.list);
mutex_unlock(&smcd_dev_list.mutex);
pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
@@ -399,3 +520,8 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
spin_unlock_irqrestore(&smcd->lock, flags);
}
EXPORT_SYMBOL_GPL(smcd_handle_irq);
+
+void __init smc_ism_init(void)
+{
+ smc_ism_v2_capable = false;
+}
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
index 81cc4537efd3..113efc7352ed 100644
--- a/net/smc/smc_ism.h
+++ b/net/smc/smc_ism.h
@@ -10,6 +10,7 @@
#define SMCD_ISM_H
#include <linux/uio.h>
+#include <linux/types.h>
#include <linux/mutex.h>
#include "smc.h"
@@ -19,7 +20,7 @@ struct smcd_dev_list { /* List of SMCD devices */
struct mutex mutex; /* Protects list of devices */
};
-extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
+extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
struct smc_ism_vlanid { /* VLAN id set on ISM device */
struct list_head list;
@@ -47,4 +48,9 @@ int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos,
void *data, size_t len);
int smc_ism_signal_shutdown(struct smc_link_group *lgr);
+void smc_ism_get_system_eid(struct smcd_dev *dev, u8 **eid);
+u16 smc_ism_get_chid(struct smcd_dev *dev);
+bool smc_ism_is_v2_capable(void);
+void smc_ism_init(void);
+int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb);
#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 3ea33466ebe9..273eaf1bfe49 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -233,8 +233,6 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow,
default:
flow->type = SMC_LLC_FLOW_NONE;
}
- if (qentry == lgr->delayed_event)
- lgr->delayed_event = NULL;
smc_llc_flow_qentry_set(flow, qentry);
spin_unlock_bh(&lgr->llc_flow_lock);
return true;
@@ -1209,7 +1207,7 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr)
/* enqueue a local add_link req to trigger a new add_link flow */
void smc_llc_add_link_local(struct smc_link *link)
{
- struct smc_llc_msg_add_link add_llc = {0};
+ struct smc_llc_msg_add_link add_llc = {};
add_llc.hd.length = sizeof(add_llc);
add_llc.hd.common.type = SMC_LLC_ADD_LINK;
@@ -1242,7 +1240,7 @@ out:
*/
void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id)
{
- struct smc_llc_msg_del_link del_llc = {0};
+ struct smc_llc_msg_del_link del_llc = {};
del_llc.hd.length = sizeof(del_llc);
del_llc.hd.common.type = SMC_LLC_DELETE_LINK;
@@ -1314,7 +1312,7 @@ out:
*/
void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn)
{
- struct smc_llc_msg_del_link delllc = {0};
+ struct smc_llc_msg_del_link delllc = {};
int i;
delllc.hd.common.type = SMC_LLC_DELETE_LINK;
@@ -1603,13 +1601,12 @@ static void smc_llc_event_work(struct work_struct *work)
struct smc_llc_qentry *qentry;
if (!lgr->llc_flow_lcl.type && lgr->delayed_event) {
- if (smc_link_usable(lgr->delayed_event->link)) {
- smc_llc_event_handler(lgr->delayed_event);
- } else {
- qentry = lgr->delayed_event;
- lgr->delayed_event = NULL;
+ qentry = lgr->delayed_event;
+ lgr->delayed_event = NULL;
+ if (smc_link_usable(qentry->link))
+ smc_llc_event_handler(qentry);
+ else
kfree(qentry);
- }
}
again:
@@ -1691,7 +1688,7 @@ static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc)
spin_lock_irqsave(&lgr->llc_event_q_lock, flags);
list_add_tail(&qentry->list, &lgr->llc_event_q);
spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags);
- schedule_work(&lgr->llc_event_work);
+ queue_work(system_highpri_wq, &lgr->llc_event_work);
}
/* copy received msg and add it to the event queue */
diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c
new file mode 100644
index 000000000000..140419a19dbf
--- /dev/null
+++ b/net/smc/smc_netlink.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic netlink support functions to interact with SMC module
+ *
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s): Guvenc Gulce <guvenc@linux.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/if.h>
+#include <linux/smc.h>
+
+#include "smc_core.h"
+#include "smc_ism.h"
+#include "smc_ib.h"
+#include "smc_netlink.h"
+
+#define SMC_CMD_MAX_ATTR 1
+
+/* SMC_GENL generic netlink operation definition */
+static const struct genl_ops smc_gen_nl_ops[] = {
+ {
+ .cmd = SMC_NETLINK_GET_SYS_INFO,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_get_sys_info,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_LGR_SMCR,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcr_nl_get_lgr,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_LINK_SMCR,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcr_nl_get_link,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_LGR_SMCD,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcd_nl_get_lgr,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_DEV_SMCD,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcd_nl_get_device,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_DEV_SMCR,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcr_nl_get_device,
+ },
+};
+
+static const struct nla_policy smc_gen_nl_policy[2] = {
+ [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, },
+};
+
+/* SMC_GENL family definition */
+struct genl_family smc_gen_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = SMC_GENL_FAMILY_NAME,
+ .version = SMC_GENL_FAMILY_VERSION,
+ .maxattr = SMC_CMD_MAX_ATTR,
+ .policy = smc_gen_nl_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = smc_gen_nl_ops,
+ .n_ops = ARRAY_SIZE(smc_gen_nl_ops)
+};
+
+int __init smc_nl_init(void)
+{
+ return genl_register_family(&smc_gen_nl_family);
+}
+
+void smc_nl_exit(void)
+{
+ genl_unregister_family(&smc_gen_nl_family);
+}
diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h
new file mode 100644
index 000000000000..3477265cba6c
--- /dev/null
+++ b/net/smc/smc_netlink.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * SMC Generic netlink operations
+ *
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s): Guvenc Gulce <guvenc@linux.ibm.com>
+ */
+
+#ifndef _SMC_NETLINK_H
+#define _SMC_NETLINK_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+extern struct genl_family smc_gen_nl_family;
+
+struct smc_nl_dmp_ctx {
+ int pos[2];
+};
+
+static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c)
+{
+ return (struct smc_nl_dmp_ctx *)c->ctx;
+}
+
+int smc_nl_init(void) __init;
+void smc_nl_exit(void);
+
+#endif
diff --git a/net/smc/smc_netns.h b/net/smc/smc_netns.h
index e7a8fc4ae02f..0f4f35aa43ad 100644
--- a/net/smc/smc_netns.h
+++ b/net/smc/smc_netns.h
@@ -16,5 +16,6 @@ extern unsigned int smc_net_id;
/* per-network namespace private data */
struct smc_net {
struct smc_pnettable pnettable;
+ struct smc_pnetids_ndev pnetids_ndev;
};
#endif
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 30e5fac7034e..6f6d33edb135 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -29,8 +29,7 @@
#include "smc_ism.h"
#include "smc_core.h"
-#define SMC_ASCII_BLANK 32
-
+static struct net_device *__pnet_find_base_ndev(struct net_device *ndev);
static struct net_device *pnet_find_base_ndev(struct net_device *ndev);
static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
@@ -73,14 +72,22 @@ struct smc_pnetentry {
};
};
+/* Check if the pnetid is set */
+bool smc_pnet_is_pnetid_set(u8 *pnetid)
+{
+ if (pnetid[0] == 0 || pnetid[0] == _S)
+ return false;
+ return true;
+}
+
/* Check if two given pnetids match */
static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2)
{
int i;
for (i = 0; i < SMC_MAX_PNETID_LEN; i++) {
- if ((pnetid1[i] == 0 || pnetid1[i] == SMC_ASCII_BLANK) &&
- (pnetid2[i] == 0 || pnetid2[i] == SMC_ASCII_BLANK))
+ if ((pnetid1[i] == 0 || pnetid1[i] == _S) &&
+ (pnetid2[i] == 0 || pnetid2[i] == _S))
break;
if (pnetid1[i] != pnetid2[i])
return false;
@@ -238,11 +245,10 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev)
static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port,
char *pnet_name)
{
- u8 pnet_null[SMC_MAX_PNETID_LEN] = {0};
bool applied = false;
mutex_lock(&smc_ib_devices.mutex);
- if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) {
+ if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) {
memcpy(ib_dev->pnetid[ib_port - 1], pnet_name,
SMC_MAX_PNETID_LEN);
ib_dev->pnetid_by_user[ib_port - 1] = true;
@@ -256,11 +262,10 @@ static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port,
*/
static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name)
{
- u8 pnet_null[SMC_MAX_PNETID_LEN] = {0};
bool applied = false;
mutex_lock(&smcd_dev_list.mutex);
- if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) {
+ if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) {
memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN);
smcd_dev->pnetid_by_user = true;
applied = true;
@@ -708,18 +713,136 @@ static struct genl_family smc_pnet_nl_family __ro_after_init = {
.n_ops = ARRAY_SIZE(smc_pnet_ops)
};
+bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe;
+ bool rc = false;
+
+ read_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry(pe, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ rc = true;
+ goto unlock;
+ }
+ }
+
+unlock:
+ read_unlock(&sn->pnetids_ndev.lock);
+ return rc;
+}
+
+static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *pi;
+
+ pe = kzalloc(sizeof(*pe), GFP_KERNEL);
+ if (!pe)
+ return -ENOMEM;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry(pi, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ refcount_inc(&pi->refcnt);
+ kfree(pe);
+ goto unlock;
+ }
+ }
+ refcount_set(&pe->refcnt, 1);
+ memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN);
+ list_add_tail(&pe->list, &sn->pnetids_ndev.list);
+
+unlock:
+ write_unlock(&sn->pnetids_ndev.lock);
+ return 0;
+}
+
+static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *pe2;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ if (refcount_dec_and_test(&pe->refcnt)) {
+ list_del(&pe->list);
+ kfree(pe);
+ }
+ break;
+ }
+ }
+ write_unlock(&sn->pnetids_ndev.lock);
+}
+
+static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev,
+ u8 *ndev_pnetid)
+{
+ struct net_device *base_dev;
+
+ base_dev = __pnet_find_base_ndev(dev);
+ if (base_dev->flags & IFF_UP &&
+ !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port,
+ ndev_pnetid)) {
+ /* add to PNETIDs list */
+ smc_pnet_add_pnetid(net, ndev_pnetid);
+ }
+}
+
+/* create initial list of netdevice pnetids */
+static void smc_pnet_create_pnetids_list(struct net *net)
+{
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev)
+ smc_pnet_add_base_pnetid(net, dev, ndev_pnetid);
+ rtnl_unlock();
+}
+
+/* clean up list of netdevice pnetids */
+static void smc_pnet_destroy_pnetids_list(struct net *net)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *temp_pe;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) {
+ list_del(&pe->list);
+ kfree(pe);
+ }
+ write_unlock(&sn->pnetids_ndev.lock);
+}
+
static int smc_pnet_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(event_dev);
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
switch (event) {
case NETDEV_REBOOT:
case NETDEV_UNREGISTER:
smc_pnet_remove_by_ndev(event_dev);
+ smc_ib_ndev_change(event_dev, event);
return NOTIFY_OK;
case NETDEV_REGISTER:
smc_pnet_add_by_ndev(event_dev);
+ smc_ib_ndev_change(event_dev, event);
+ return NOTIFY_OK;
+ case NETDEV_UP:
+ smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid);
+ return NOTIFY_OK;
+ case NETDEV_DOWN:
+ event_dev = __pnet_find_base_ndev(event_dev);
+ if (!smc_pnetid_by_dev_port(event_dev->dev.parent,
+ event_dev->dev_port, ndev_pnetid)) {
+ /* remove from PNETIDs list */
+ smc_pnet_remove_pnetid(net, ndev_pnetid);
+ }
return NOTIFY_OK;
default:
return NOTIFY_DONE;
@@ -735,9 +858,14 @@ int smc_pnet_net_init(struct net *net)
{
struct smc_net *sn = net_generic(net, smc_net_id);
struct smc_pnettable *pnettable = &sn->pnettable;
+ struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev;
INIT_LIST_HEAD(&pnettable->pnetlist);
rwlock_init(&pnettable->lock);
+ INIT_LIST_HEAD(&pnetids_ndev->list);
+ rwlock_init(&pnetids_ndev->lock);
+
+ smc_pnet_create_pnetids_list(net);
return 0;
}
@@ -752,6 +880,7 @@ int __init smc_pnet_init(void)
rc = register_netdevice_notifier(&smc_netdev_notifier);
if (rc)
genl_unregister_family(&smc_pnet_nl_family);
+
return rc;
}
@@ -760,6 +889,7 @@ void smc_pnet_net_exit(struct net *net)
{
/* flush pnet table */
smc_pnet_remove_by_pnetid(net, NULL);
+ smc_pnet_destroy_pnetids_list(net);
}
void smc_pnet_exit(void)
@@ -768,16 +898,11 @@ void smc_pnet_exit(void)
genl_unregister_family(&smc_pnet_nl_family);
}
-/* Determine one base device for stacked net devices.
- * If the lower device level contains more than one devices
- * (for instance with bonding slaves), just the first device
- * is used to reach a base device.
- */
-static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
+static struct net_device *__pnet_find_base_ndev(struct net_device *ndev)
{
int i, nest_lvl;
- rtnl_lock();
+ ASSERT_RTNL();
nest_lvl = ndev->lower_level;
for (i = 0; i < nest_lvl; i++) {
struct list_head *lower = &ndev->adj_list.lower;
@@ -787,6 +912,18 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
lower = lower->next;
ndev = netdev_lower_get_next(ndev, &lower);
}
+ return ndev;
+}
+
+/* Determine one base device for stacked net devices.
+ * If the lower device level contains more than one devices
+ * (for instance with bonding slaves), just the first device
+ * is used to reach a base device.
+ */
+static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
+{
+ rtnl_lock();
+ ndev = __pnet_find_base_ndev(ndev);
rtnl_unlock();
return ndev;
}
@@ -928,8 +1065,11 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
mutex_lock(&smcd_dev_list.mutex);
list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) &&
- !ismdev->going_away) {
- ini->ism_dev = ismdev;
+ !ismdev->going_away &&
+ (!ini->ism_peer_gid[0] ||
+ !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id,
+ ismdev))) {
+ ini->ism_dev[0] = ismdev;
break;
}
}
@@ -963,7 +1103,7 @@ void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini)
{
struct dst_entry *dst = sk_dst_get(sk);
- ini->ism_dev = NULL;
+ ini->ism_dev[0] = NULL;
if (!dst)
goto out;
if (!dst->dev)
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
index 811a65986691..14039272f7e4 100644
--- a/net/smc/smc_pnet.h
+++ b/net/smc/smc_pnet.h
@@ -12,6 +12,8 @@
#ifndef _SMC_PNET_H
#define _SMC_PNET_H
+#include <net/smc.h>
+
#if IS_ENABLED(CONFIG_HAVE_PNETID)
#include <asm/pnet.h>
#endif
@@ -31,6 +33,17 @@ struct smc_pnettable {
struct list_head pnetlist;
};
+struct smc_pnetids_ndev { /* list of pnetids for net devices in UP state*/
+ struct list_head list;
+ rwlock_t lock;
+};
+
+struct smc_pnetids_ndev_entry {
+ struct list_head list;
+ u8 pnetid[SMC_MAX_PNETID_LEN];
+ refcount_t refcnt;
+};
+
static inline int smc_pnetid_by_dev_port(struct device *dev,
unsigned short port, u8 *pnetid)
{
@@ -52,4 +65,6 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcd);
void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
struct smc_init_info *ini,
struct smc_ib_device *known_dev);
+bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid);
+bool smc_pnet_is_pnetid_set(u8 *pnetid);
#endif
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 54ba0443847e..4532c16bf85e 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -228,8 +228,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
/* for a corked socket defer the RDMA writes if there
* is still sufficient sndbuf_space available
*/
- schedule_delayed_work(&conn->tx_work,
- SMC_TX_CORK_DELAY);
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_CORK_DELAY);
else
smc_tx_sndbuf_nonempty(conn);
} /* while (msg_data_left(msg)) */
@@ -499,7 +499,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
if (conn->killed)
return -EPIPE;
rc = 0;
- mod_delayed_work(system_wq, &conn->tx_work,
+ mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
SMC_TX_WORK_DELAY);
}
return rc;
@@ -623,8 +623,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
return;
if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
!conn->killed) {
- schedule_delayed_work(&conn->tx_work,
- SMC_TX_WORK_DELAY);
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
return;
}
}
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 1e23cdd41eb1..cbc73a7e4d59 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -131,9 +131,9 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
wake_up(&link->wr_tx_wait);
}
-static void smc_wr_tx_tasklet_fn(unsigned long data)
+static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
{
- struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
int i = 0, rc;
int polled = 0;
@@ -435,9 +435,9 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
}
}
-static void smc_wr_rx_tasklet_fn(unsigned long data)
+static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
{
- struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
int polled = 0;
int rc;
@@ -698,10 +698,8 @@ void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
void smc_wr_add_dev(struct smc_ib_device *smcibdev)
{
- tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
- (unsigned long)smcibdev);
- tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
- (unsigned long)smcibdev);
+ tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
+ tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
}
int smc_wr_create_link(struct smc_link *lnk)
diff --git a/net/socket.c b/net/socket.c
index 58cac2da5f66..33e8b6c4e1d3 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -52,6 +52,7 @@
* Based upon Swansea University Computer Society NET3.039
*/
+#include <linux/ethtool.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/file.h>
@@ -64,7 +65,6 @@
#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <linux/if_bridge.h>
-#include <linux/if_frad.h>
#include <linux/if_vlan.h>
#include <linux/ptp_classify.h>
#include <linux/init.h>
@@ -445,17 +445,15 @@ static int sock_map_fd(struct socket *sock, int flags)
/**
* sock_from_file - Return the &socket bounded to @file.
* @file: file
- * @err: pointer to an error code return
*
- * On failure returns %NULL and assigns -ENOTSOCK to @err.
+ * On failure returns %NULL.
*/
-struct socket *sock_from_file(struct file *file, int *err)
+struct socket *sock_from_file(struct file *file)
{
if (file->f_op == &socket_file_ops)
return file->private_data; /* set in sock_map_fd */
- *err = -ENOTSOCK;
return NULL;
}
EXPORT_SYMBOL(sock_from_file);
@@ -484,9 +482,11 @@ struct socket *sockfd_lookup(int fd, int *err)
return NULL;
}
- sock = sock_from_file(file, err);
- if (!sock)
+ sock = sock_from_file(file);
+ if (!sock) {
+ *err = -ENOTSOCK;
fput(file);
+ }
return sock;
}
EXPORT_SYMBOL(sockfd_lookup);
@@ -498,11 +498,12 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
*err = -EBADF;
if (f.file) {
- sock = sock_from_file(f.file, err);
+ sock = sock_from_file(f.file);
if (likely(sock)) {
*fput_needed = f.flags & FDPUT_FPUT;
return sock;
}
+ *err = -ENOTSOCK;
fdput(f);
}
return NULL;
@@ -1027,17 +1028,6 @@ void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
}
EXPORT_SYMBOL(vlan_ioctl_set);
-static DEFINE_MUTEX(dlci_ioctl_mutex);
-static int (*dlci_ioctl_hook) (unsigned int, void __user *);
-
-void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
-{
- mutex_lock(&dlci_ioctl_mutex);
- dlci_ioctl_hook = hook;
- mutex_unlock(&dlci_ioctl_mutex);
-}
-EXPORT_SYMBOL(dlci_ioctl_set);
-
static long sock_do_ioctl(struct net *net, struct socket *sock,
unsigned int cmd, unsigned long arg)
{
@@ -1156,17 +1146,6 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
err = vlan_ioctl_hook(net, argp);
mutex_unlock(&vlan_ioctl_mutex);
break;
- case SIOCADDDLCI:
- case SIOCDELDLCI:
- err = -ENOPKG;
- if (!dlci_ioctl_hook)
- request_module("dlci");
-
- mutex_lock(&dlci_ioctl_mutex);
- if (dlci_ioctl_hook)
- err = dlci_ioctl_hook(cmd, argp);
- mutex_unlock(&dlci_ioctl_mutex);
- break;
case SIOCGSKNS:
err = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -1715,9 +1694,11 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
- sock = sock_from_file(file, &err);
- if (!sock)
+ sock = sock_from_file(file);
+ if (!sock) {
+ err = -ENOTSOCK;
goto out;
+ }
err = -ENFILE;
newsock = sock_alloc();
@@ -1840,9 +1821,11 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
struct socket *sock;
int err;
- sock = sock_from_file(file, &err);
- if (!sock)
+ sock = sock_from_file(file);
+ if (!sock) {
+ err = -ENOTSOCK;
goto out;
+ }
err =
security_socket_connect(sock, (struct sockaddr *)address, addrlen);
@@ -2192,6 +2175,17 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
* Shutdown a socket.
*/
+int __sys_shutdown_sock(struct socket *sock, int how)
+{
+ int err;
+
+ err = security_socket_shutdown(sock, how);
+ if (!err)
+ err = sock->ops->shutdown(sock, how);
+
+ return err;
+}
+
int __sys_shutdown(int fd, int how)
{
int err, fput_needed;
@@ -2199,9 +2193,7 @@ int __sys_shutdown(int fd, int how)
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock != NULL) {
- err = security_socket_shutdown(sock, how);
- if (!err)
- err = sock->ops->shutdown(sock, how);
+ err = __sys_shutdown_sock(sock, how);
fput_light(sock->file, fput_needed);
}
return err;
@@ -2628,9 +2620,11 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
struct user_msghdr __user *umsg,
struct sockaddr __user *uaddr, unsigned int flags)
{
- /* disallow ancillary data requests from this path */
- if (msg->msg_control || msg->msg_controllen)
- return -EINVAL;
+ if (msg->msg_control || msg->msg_controllen) {
+ /* disallow ancillary data reqs unless cmsg is plain data */
+ if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY))
+ return -EINVAL;
+ }
return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
}
@@ -3425,8 +3419,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
case SIOCBRDELBR:
case SIOCGIFVLAN:
case SIOCSIFVLAN:
- case SIOCADDDLCI:
- case SIOCDELDLCI:
case SIOCGSKNS:
case SIOCGSTAMP_NEW:
case SIOCGSTAMPNS_NEW:
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 3bcf985507be..bbbb5af0af13 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -21,7 +21,6 @@ config RPCSEC_GSS_KRB5
depends on SUNRPC && CRYPTO
depends on CRYPTO_MD5 && CRYPTO_DES && CRYPTO_CBC && CRYPTO_CTS
depends on CRYPTO_ECB && CRYPTO_HMAC && CRYPTO_SHA1 && CRYPTO_AES
- depends on CRYPTO_ARC4
default y
select SUNRPC_GSS
help
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index 010dcb876f9d..6e4dbd577a39 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -185,7 +185,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
scope_id = dev->ifindex;
dev_put(dev);
} else {
- if (kstrtou32(p, 10, &scope_id) == 0) {
+ if (kstrtou32(p, 10, &scope_id) != 0) {
kfree(p);
return 0;
}
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 4ecc2a959567..5f42aa5fc612 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -29,6 +29,7 @@
#include <linux/uaccess.h>
#include <linux/hashtable.h>
+#include "auth_gss_internal.h"
#include "../netns.h"
#include <trace/events/rpcgss.h>
@@ -125,35 +126,6 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
}
-static const void *
-simple_get_bytes(const void *p, const void *end, void *res, size_t len)
-{
- const void *q = (const void *)((const char *)p + len);
- if (unlikely(q > end || q < p))
- return ERR_PTR(-EFAULT);
- memcpy(res, p, len);
- return q;
-}
-
-static inline const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
-{
- const void *q;
- unsigned int len;
-
- p = simple_get_bytes(p, end, &len, sizeof(len));
- if (IS_ERR(p))
- return p;
- q = (const void *)((const char *)p + len);
- if (unlikely(q > end || q < p))
- return ERR_PTR(-EFAULT);
- dest->data = kmemdup(p, len, GFP_NOFS);
- if (unlikely(dest->data == NULL))
- return ERR_PTR(-ENOMEM);
- dest->len = len;
- return q;
-}
-
static struct gss_cl_ctx *
gss_cred_get_ctx(struct rpc_cred *cred)
{
diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h
new file mode 100644
index 000000000000..f6d9631bd9d0
--- /dev/null
+++ b/net/sunrpc/auth_gss/auth_gss_internal.h
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ * linux/net/sunrpc/auth_gss/auth_gss_internal.h
+ *
+ * Internal definitions for RPCSEC_GSS client authentication
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ */
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/sunrpc/xdr.h>
+
+static inline const void *
+simple_get_bytes(const void *p, const void *end, void *res, size_t len)
+{
+ const void *q = (const void *)((const char *)p + len);
+ if (unlikely(q > end || q < p))
+ return ERR_PTR(-EFAULT);
+ memcpy(res, p, len);
+ return q;
+}
+
+static inline const void *
+simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
+{
+ const void *q;
+ unsigned int len;
+
+ p = simple_get_bytes(p, end, &len, sizeof(len));
+ if (IS_ERR(p))
+ return p;
+ q = (const void *)((const char *)p + len);
+ if (unlikely(q > end || q < p))
+ return ERR_PTR(-EFAULT);
+ if (len) {
+ dest->data = kmemdup(p, len, GFP_NOFS);
+ if (unlikely(dest->data == NULL))
+ return ERR_PTR(-ENOMEM);
+ } else
+ dest->data = NULL;
+ dest->len = len;
+ return q;
+}
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 794fb3001880..634b6c6e0dcb 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -138,135 +138,6 @@ checksummer(struct scatterlist *sg, void *data)
return crypto_ahash_update(req);
}
-static int
-arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4])
-{
- unsigned int ms_usage;
-
- switch (usage) {
- case KG_USAGE_SIGN:
- ms_usage = 15;
- break;
- case KG_USAGE_SEAL:
- ms_usage = 13;
- break;
- default:
- return -EINVAL;
- }
- salt[0] = (ms_usage >> 0) & 0xff;
- salt[1] = (ms_usage >> 8) & 0xff;
- salt[2] = (ms_usage >> 16) & 0xff;
- salt[3] = (ms_usage >> 24) & 0xff;
-
- return 0;
-}
-
-static u32
-make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
- struct xdr_buf *body, int body_offset, u8 *cksumkey,
- unsigned int usage, struct xdr_netobj *cksumout)
-{
- struct scatterlist sg[1];
- int err = -1;
- u8 *checksumdata;
- u8 *rc4salt;
- struct crypto_ahash *md5;
- struct crypto_ahash *hmac_md5;
- struct ahash_request *req;
-
- if (cksumkey == NULL)
- return GSS_S_FAILURE;
-
- if (cksumout->len < kctx->gk5e->cksumlength) {
- dprintk("%s: checksum buffer length, %u, too small for %s\n",
- __func__, cksumout->len, kctx->gk5e->name);
- return GSS_S_FAILURE;
- }
-
- rc4salt = kmalloc_array(4, sizeof(*rc4salt), GFP_NOFS);
- if (!rc4salt)
- return GSS_S_FAILURE;
-
- if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) {
- dprintk("%s: invalid usage value %u\n", __func__, usage);
- goto out_free_rc4salt;
- }
-
- checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
- if (!checksumdata)
- goto out_free_rc4salt;
-
- md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(md5))
- goto out_free_cksum;
-
- hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(hmac_md5))
- goto out_free_md5;
-
- req = ahash_request_alloc(md5, GFP_NOFS);
- if (!req)
- goto out_free_hmac_md5;
-
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
-
- err = crypto_ahash_init(req);
- if (err)
- goto out;
- sg_init_one(sg, rc4salt, 4);
- ahash_request_set_crypt(req, sg, NULL, 4);
- err = crypto_ahash_update(req);
- if (err)
- goto out;
-
- sg_init_one(sg, header, hdrlen);
- ahash_request_set_crypt(req, sg, NULL, hdrlen);
- err = crypto_ahash_update(req);
- if (err)
- goto out;
- err = xdr_process_buf(body, body_offset, body->len - body_offset,
- checksummer, req);
- if (err)
- goto out;
- ahash_request_set_crypt(req, NULL, checksumdata, 0);
- err = crypto_ahash_final(req);
- if (err)
- goto out;
-
- ahash_request_free(req);
- req = ahash_request_alloc(hmac_md5, GFP_NOFS);
- if (!req)
- goto out_free_hmac_md5;
-
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
-
- err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
- if (err)
- goto out;
-
- sg_init_one(sg, checksumdata, crypto_ahash_digestsize(md5));
- ahash_request_set_crypt(req, sg, checksumdata,
- crypto_ahash_digestsize(md5));
- err = crypto_ahash_digest(req);
- if (err)
- goto out;
-
- memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
- cksumout->len = kctx->gk5e->cksumlength;
-out:
- ahash_request_free(req);
-out_free_hmac_md5:
- crypto_free_ahash(hmac_md5);
-out_free_md5:
- crypto_free_ahash(md5);
-out_free_cksum:
- kfree(checksumdata);
-out_free_rc4salt:
- kfree(rc4salt);
- return err ? GSS_S_FAILURE : 0;
-}
-
/*
* checksum the plaintext data and hdrlen bytes of the token header
* The checksum is performed over the first 8 bytes of the
@@ -284,11 +155,6 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
u8 *checksumdata;
unsigned int checksumlen;
- if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR)
- return make_checksum_hmac_md5(kctx, header, hdrlen,
- body, body_offset,
- cksumkey, usage, cksumout);
-
if (cksumout->len < kctx->gk5e->cksumlength) {
dprintk("%s: checksum buffer length, %u, too small for %s\n",
__func__, cksumout->len, kctx->gk5e->name);
@@ -942,145 +808,3 @@ out_err:
ret = GSS_S_FAILURE;
return ret;
}
-
-/*
- * Compute Kseq given the initial session key and the checksum.
- * Set the key of the given cipher.
- */
-int
-krb5_rc4_setup_seq_key(struct krb5_ctx *kctx,
- struct crypto_sync_skcipher *cipher,
- unsigned char *cksum)
-{
- struct crypto_shash *hmac;
- struct shash_desc *desc;
- u8 Kseq[GSS_KRB5_MAX_KEYLEN];
- u32 zeroconstant = 0;
- int err;
-
- dprintk("%s: entered\n", __func__);
-
- hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
- if (IS_ERR(hmac)) {
- dprintk("%s: error %ld, allocating hash '%s'\n",
- __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
- return PTR_ERR(hmac);
- }
-
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_NOFS);
- if (!desc) {
- dprintk("%s: failed to allocate shash descriptor for '%s'\n",
- __func__, kctx->gk5e->cksum_name);
- crypto_free_shash(hmac);
- return -ENOMEM;
- }
-
- desc->tfm = hmac;
-
- /* Compute intermediate Kseq from session key */
- err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = crypto_shash_digest(desc, (u8 *)&zeroconstant, 4, Kseq);
- if (err)
- goto out_err;
-
- /* Compute final Kseq from the checksum and intermediate Kseq */
- err = crypto_shash_setkey(hmac, Kseq, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = crypto_shash_digest(desc, cksum, 8, Kseq);
- if (err)
- goto out_err;
-
- err = crypto_sync_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = 0;
-
-out_err:
- kfree_sensitive(desc);
- crypto_free_shash(hmac);
- dprintk("%s: returning %d\n", __func__, err);
- return err;
-}
-
-/*
- * Compute Kcrypt given the initial session key and the plaintext seqnum.
- * Set the key of cipher kctx->enc.
- */
-int
-krb5_rc4_setup_enc_key(struct krb5_ctx *kctx,
- struct crypto_sync_skcipher *cipher,
- s32 seqnum)
-{
- struct crypto_shash *hmac;
- struct shash_desc *desc;
- u8 Kcrypt[GSS_KRB5_MAX_KEYLEN];
- u8 zeroconstant[4] = {0};
- u8 seqnumarray[4];
- int err, i;
-
- dprintk("%s: entered, seqnum %u\n", __func__, seqnum);
-
- hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
- if (IS_ERR(hmac)) {
- dprintk("%s: error %ld, allocating hash '%s'\n",
- __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
- return PTR_ERR(hmac);
- }
-
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_NOFS);
- if (!desc) {
- dprintk("%s: failed to allocate shash descriptor for '%s'\n",
- __func__, kctx->gk5e->cksum_name);
- crypto_free_shash(hmac);
- return -ENOMEM;
- }
-
- desc->tfm = hmac;
-
- /* Compute intermediate Kcrypt from session key */
- for (i = 0; i < kctx->gk5e->keylength; i++)
- Kcrypt[i] = kctx->Ksess[i] ^ 0xf0;
-
- err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = crypto_shash_digest(desc, zeroconstant, 4, Kcrypt);
- if (err)
- goto out_err;
-
- /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */
- err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff);
- seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff);
- seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff);
- seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff);
-
- err = crypto_shash_digest(desc, seqnumarray, 4, Kcrypt);
- if (err)
- goto out_err;
-
- err = crypto_sync_skcipher_setkey(cipher, Kcrypt,
- kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = 0;
-
-out_err:
- kfree_sensitive(desc);
- crypto_free_shash(hmac);
- dprintk("%s: returning %d\n", __func__, err);
- return err;
-}
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index a84a5b289484..1c092b05c2bb 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -21,6 +21,8 @@
#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/gss_krb5_enctypes.h>
+#include "auth_gss_internal.h"
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
@@ -52,27 +54,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = {
},
#endif /* CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES */
/*
- * RC4-HMAC
- */
- {
- .etype = ENCTYPE_ARCFOUR_HMAC,
- .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR,
- .name = "rc4-hmac",
- .encrypt_name = "ecb(arc4)",
- .cksum_name = "hmac(md5)",
- .encrypt = krb5_encrypt,
- .decrypt = krb5_decrypt,
- .mk_key = NULL,
- .signalg = SGN_ALG_HMAC_MD5,
- .sealalg = SEAL_ALG_MICROSOFT_RC4,
- .keybytes = 16,
- .keylength = 16,
- .blocksize = 1,
- .conflen = 8,
- .cksumlength = 8,
- .keyed_cksum = 1,
- },
- /*
* 3DES
*/
{
@@ -164,35 +145,6 @@ get_gss_krb5_enctype(int etype)
return NULL;
}
-static const void *
-simple_get_bytes(const void *p, const void *end, void *res, int len)
-{
- const void *q = (const void *)((const char *)p + len);
- if (unlikely(q > end || q < p))
- return ERR_PTR(-EFAULT);
- memcpy(res, p, len);
- return q;
-}
-
-static const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
-{
- const void *q;
- unsigned int len;
-
- p = simple_get_bytes(p, end, &len, sizeof(len));
- if (IS_ERR(p))
- return p;
- q = (const void *)((const char *)p + len);
- if (unlikely(q > end || q < p))
- return ERR_PTR(-EFAULT);
- res->data = kmemdup(p, len, GFP_NOFS);
- if (unlikely(res->data == NULL))
- return ERR_PTR(-ENOMEM);
- res->len = len;
- return q;
-}
-
static inline const void *
get_key(const void *p, const void *end,
struct krb5_ctx *ctx, struct crypto_sync_skcipher **res)
@@ -401,78 +353,6 @@ out_err:
return -EINVAL;
}
-/*
- * Note that RC4 depends on deriving keys using the sequence
- * number or the checksum of a token. Therefore, the final keys
- * cannot be calculated until the token is being constructed!
- */
-static int
-context_derive_keys_rc4(struct krb5_ctx *ctx)
-{
- struct crypto_shash *hmac;
- char sigkeyconstant[] = "signaturekey";
- int slen = strlen(sigkeyconstant) + 1; /* include null terminator */
- struct shash_desc *desc;
- int err;
-
- dprintk("RPC: %s: entered\n", __func__);
- /*
- * derive cksum (aka Ksign) key
- */
- hmac = crypto_alloc_shash(ctx->gk5e->cksum_name, 0, 0);
- if (IS_ERR(hmac)) {
- dprintk("%s: error %ld allocating hash '%s'\n",
- __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name);
- err = PTR_ERR(hmac);
- goto out_err;
- }
-
- err = crypto_shash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength);
- if (err)
- goto out_err_free_hmac;
-
-
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS);
- if (!desc) {
- dprintk("%s: failed to allocate hash descriptor for '%s'\n",
- __func__, ctx->gk5e->cksum_name);
- err = -ENOMEM;
- goto out_err_free_hmac;
- }
-
- desc->tfm = hmac;
-
- err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum);
- kfree_sensitive(desc);
- if (err)
- goto out_err_free_hmac;
- /*
- * allocate hash, and skciphers for data and seqnum encryption
- */
- ctx->enc = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(ctx->enc)) {
- err = PTR_ERR(ctx->enc);
- goto out_err_free_hmac;
- }
-
- ctx->seq = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(ctx->seq)) {
- crypto_free_sync_skcipher(ctx->enc);
- err = PTR_ERR(ctx->seq);
- goto out_err_free_hmac;
- }
-
- dprintk("RPC: %s: returning success\n", __func__);
-
- err = 0;
-
-out_err_free_hmac:
- crypto_free_shash(hmac);
-out_err:
- dprintk("RPC: %s: returning %d\n", __func__, err);
- return err;
-}
-
static int
context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
{
@@ -649,8 +529,6 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
switch (ctx->enctype) {
case ENCTYPE_DES3_CBC_RAW:
return context_derive_keys_des3(ctx, gfp_mask);
- case ENCTYPE_ARCFOUR_HMAC:
- return context_derive_keys_rc4(ctx);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
return context_derive_keys_new(ctx, gfp_mask);
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index f1d280accf43..33061417ec97 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -214,7 +214,6 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_get_mic_v1(ctx, text, token);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 507105127095..fb117817ff5d 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -39,42 +39,6 @@
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
-static s32
-krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
- unsigned char *cksum, unsigned char *buf)
-{
- struct crypto_sync_skcipher *cipher;
- unsigned char *plain;
- s32 code;
-
- dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(cipher))
- return PTR_ERR(cipher);
-
- plain = kmalloc(8, GFP_NOFS);
- if (!plain)
- return -ENOMEM;
-
- plain[0] = (unsigned char) ((seqnum >> 24) & 0xff);
- plain[1] = (unsigned char) ((seqnum >> 16) & 0xff);
- plain[2] = (unsigned char) ((seqnum >> 8) & 0xff);
- plain[3] = (unsigned char) ((seqnum >> 0) & 0xff);
- plain[4] = direction;
- plain[5] = direction;
- plain[6] = direction;
- plain[7] = direction;
-
- code = krb5_rc4_setup_seq_key(kctx, cipher, cksum);
- if (code)
- goto out;
-
- code = krb5_encrypt(cipher, cksum, plain, buf, 8);
-out:
- kfree(plain);
- crypto_free_sync_skcipher(cipher);
- return code;
-}
s32
krb5_make_seq_num(struct krb5_ctx *kctx,
struct crypto_sync_skcipher *key,
@@ -85,10 +49,6 @@ krb5_make_seq_num(struct krb5_ctx *kctx,
unsigned char *plain;
s32 code;
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
- return krb5_make_rc4_seq_num(kctx, direction, seqnum,
- cksum, buf);
-
plain = kmalloc(8, GFP_NOFS);
if (!plain)
return -ENOMEM;
@@ -108,50 +68,6 @@ krb5_make_seq_num(struct krb5_ctx *kctx,
return code;
}
-static s32
-krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
- unsigned char *buf, int *direction, s32 *seqnum)
-{
- struct crypto_sync_skcipher *cipher;
- unsigned char *plain;
- s32 code;
-
- dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(cipher))
- return PTR_ERR(cipher);
-
- code = krb5_rc4_setup_seq_key(kctx, cipher, cksum);
- if (code)
- goto out;
-
- plain = kmalloc(8, GFP_NOFS);
- if (!plain) {
- code = -ENOMEM;
- goto out;
- }
-
- code = krb5_decrypt(cipher, cksum, buf, plain, 8);
- if (code)
- goto out_plain;
-
- if ((plain[4] != plain[5]) || (plain[4] != plain[6])
- || (plain[4] != plain[7])) {
- code = (s32)KG_BAD_SEQ;
- goto out_plain;
- }
-
- *direction = plain[4];
-
- *seqnum = ((plain[0] << 24) | (plain[1] << 16) |
- (plain[2] << 8) | (plain[3]));
-out_plain:
- kfree(plain);
-out:
- crypto_free_sync_skcipher(cipher);
- return code;
-}
-
s32
krb5_get_seq_num(struct krb5_ctx *kctx,
unsigned char *cksum,
@@ -164,9 +80,6 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
dprintk("RPC: krb5_get_seq_num:\n");
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
- return krb5_get_rc4_seq_num(kctx, cksum, buf,
- direction, seqnum);
plain = kmalloc(8, GFP_NOFS);
if (!plain)
return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index aaab91cf24c8..ba04e3ec970a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -218,7 +218,6 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_verify_mic_v1(ctx, message_buffer, read_token);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 8b300b74a722..e95c009bb869 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -236,26 +236,9 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)))
return GSS_S_FAILURE;
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_sync_skcipher *cipher;
- int err;
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
- 0, 0);
- if (IS_ERR(cipher))
- return GSS_S_FAILURE;
-
- krb5_rc4_setup_enc_key(kctx, cipher, seq_send);
-
- err = gss_encrypt_xdr_buf(cipher, buf,
- offset + headlen - conflen, pages);
- crypto_free_sync_skcipher(cipher);
- if (err)
- return GSS_S_FAILURE;
- } else {
- if (gss_encrypt_xdr_buf(kctx->enc, buf,
- offset + headlen - conflen, pages))
- return GSS_S_FAILURE;
- }
+ if (gss_encrypt_xdr_buf(kctx->enc, buf,
+ offset + headlen - conflen, pages))
+ return GSS_S_FAILURE;
return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
}
@@ -316,37 +299,9 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, int len,
crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) -
(unsigned char *)buf->head[0].iov_base;
- /*
- * Need plaintext seqnum to derive encryption key for arcfour-hmac
- */
- if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN,
- ptr + 8, &direction, &seqnum))
- return GSS_S_BAD_SIG;
-
- if ((kctx->initiate && direction != 0xff) ||
- (!kctx->initiate && direction != 0))
- return GSS_S_BAD_SIG;
-
buf->len = len;
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_sync_skcipher *cipher;
- int err;
-
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
- 0, 0);
- if (IS_ERR(cipher))
- return GSS_S_FAILURE;
-
- krb5_rc4_setup_enc_key(kctx, cipher, seqnum);
-
- err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset);
- crypto_free_sync_skcipher(cipher);
- if (err)
- return GSS_S_DEFECTIVE_TOKEN;
- } else {
- if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset))
- return GSS_S_DEFECTIVE_TOKEN;
- }
+ if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset))
+ return GSS_S_DEFECTIVE_TOKEN;
if (kctx->gk5e->keyed_cksum)
cksumkey = kctx->cksum;
@@ -370,6 +325,14 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, int len,
/* do sequencing checks */
+ if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN,
+ ptr + 8, &direction, &seqnum))
+ return GSS_S_BAD_SIG;
+
+ if ((kctx->initiate && direction != 0xff) ||
+ (!kctx->initiate && direction != 0))
+ return GSS_S_BAD_SIG;
+
/* Copy the data back to the right position. XXX: Would probably be
* better to copy and encrypt at the same time. */
@@ -605,7 +568,6 @@ gss_wrap_kerberos(struct gss_ctx *gctx, int offset,
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_wrap_kerberos_v1(kctx, offset, buf, pages);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
@@ -624,7 +586,6 @@ gss_unwrap_kerberos(struct gss_ctx *gctx, int offset,
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_unwrap_kerberos_v1(kctx, offset, len, buf,
&gctx->slack, &gctx->align);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index af9c7f43859c..d1c003a25b0f 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -200,7 +200,7 @@ static int gssp_call(struct net *net, struct rpc_message *msg)
static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
{
- int i;
+ unsigned int i;
for (i = 0; i < arg->npages && arg->pages[i]; i++)
__free_page(arg->pages[i]);
@@ -210,14 +210,19 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg)
{
+ unsigned int i;
+
arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE);
arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL);
- /*
- * XXX: actual pages are allocated by xdr layer in
- * xdr_partial_copy_from_skb.
- */
if (!arg->pages)
return -ENOMEM;
+ for (i = 0; i < arg->npages; i++) {
+ arg->pages[i] = alloc_page(GFP_KERNEL);
+ if (!arg->pages[i]) {
+ gssp_free_receive_pages(arg);
+ return -ENOMEM;
+ }
+ }
return 0;
}
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 2ff7b7083eba..d79f12c2550a 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -771,7 +771,6 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req,
xdr_inline_pages(&req->rq_rcv_buf,
PAGE_SIZE/2 /* pretty arbitrary */,
arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE);
- req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
done:
if (err)
dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err);
@@ -789,7 +788,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
scratch = alloc_page(GFP_KERNEL);
if (!scratch)
return -ENOMEM;
- xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(xdr, scratch);
/* res->status */
err = gssx_dec_status(xdr, &res->status);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 258b04372f85..bd4678db9d76 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1147,9 +1147,9 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp,
struct gssp_in_token *in_token)
{
struct kvec *argv = &rqstp->rq_arg.head[0];
- unsigned int page_base, length;
- int pages, i, res;
- size_t inlen;
+ unsigned int length, pgto_offs, pgfrom_offs;
+ int pages, i, res, pgto, pgfrom;
+ size_t inlen, to_offs, from_offs;
res = gss_read_common_verf(gc, argv, authp, in_handle);
if (res)
@@ -1177,17 +1177,24 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp,
memcpy(page_address(in_token->pages[0]), argv->iov_base, length);
inlen -= length;
- i = 1;
- page_base = rqstp->rq_arg.page_base;
+ to_offs = length;
+ from_offs = rqstp->rq_arg.page_base;
while (inlen) {
- length = min_t(unsigned int, inlen, PAGE_SIZE);
- memcpy(page_address(in_token->pages[i]),
- page_address(rqstp->rq_arg.pages[i]) + page_base,
+ pgto = to_offs >> PAGE_SHIFT;
+ pgfrom = from_offs >> PAGE_SHIFT;
+ pgto_offs = to_offs & ~PAGE_MASK;
+ pgfrom_offs = from_offs & ~PAGE_MASK;
+
+ length = min_t(unsigned int, inlen,
+ min_t(unsigned int, PAGE_SIZE - pgto_offs,
+ PAGE_SIZE - pgfrom_offs));
+ memcpy(page_address(in_token->pages[pgto]) + pgto_offs,
+ page_address(rqstp->rq_arg.pages[pgfrom]) + pgfrom_offs,
length);
+ to_offs += length;
+ from_offs += length;
inlen -= length;
- page_base = 0;
- i++;
}
return 0;
}
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 195b40c5dae4..22a2c235abf1 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -5,7 +5,7 @@
NetApp provides this source code under the GPL v2 License.
The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
+https://opensource.org/licenses/gpl-license.php.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -111,7 +111,7 @@ out_free:
* by the backchannel. This function can be called multiple times
* when creating new sessions that use the same rpc_xprt. The
* preallocated buffers are added to the pool of resources used by
- * the rpc_xprt. Anyone of these resources may be used used by an
+ * the rpc_xprt. Any one of these resources may be used by an
* incoming callback request. It's up to the higher levels in the
* stack to enforce that the maximum number of session slots is not
* being exceeded.
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index baef5ee43dbb..1a2c1c44bb00 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -498,16 +498,17 @@ static int cache_clean(void)
*/
static void do_cache_clean(struct work_struct *work)
{
- int delay = 5;
- if (cache_clean() == -1)
- delay = round_jiffies_relative(30*HZ);
+ int delay;
if (list_empty(&cache_list))
- delay = 0;
+ return;
+
+ if (cache_clean() == -1)
+ delay = round_jiffies_relative(30*HZ);
+ else
+ delay = 5;
- if (delay)
- queue_delayed_work(system_power_efficient_wq,
- &cache_cleaner, delay);
+ queue_delayed_work(system_power_efficient_wq, &cache_cleaner, delay);
}
@@ -777,7 +778,6 @@ void cache_clean_deferred(void *owner)
*/
static DEFINE_SPINLOCK(queue_lock);
-static DEFINE_MUTEX(queue_io_mutex);
struct cache_queue {
struct list_head list;
@@ -905,44 +905,26 @@ static ssize_t cache_do_downcall(char *kaddr, const char __user *buf,
return ret;
}
-static ssize_t cache_slow_downcall(const char __user *buf,
- size_t count, struct cache_detail *cd)
-{
- static char write_buf[8192]; /* protected by queue_io_mutex */
- ssize_t ret = -EINVAL;
-
- if (count >= sizeof(write_buf))
- goto out;
- mutex_lock(&queue_io_mutex);
- ret = cache_do_downcall(write_buf, buf, count, cd);
- mutex_unlock(&queue_io_mutex);
-out:
- return ret;
-}
-
static ssize_t cache_downcall(struct address_space *mapping,
const char __user *buf,
size_t count, struct cache_detail *cd)
{
- struct page *page;
- char *kaddr;
+ char *write_buf;
ssize_t ret = -ENOMEM;
- if (count >= PAGE_SIZE)
- goto out_slow;
+ if (count >= 32768) { /* 32k is max userland buffer, let's check anyway */
+ ret = -EINVAL;
+ goto out;
+ }
- page = find_or_create_page(mapping, 0, GFP_KERNEL);
- if (!page)
- goto out_slow;
+ write_buf = kvmalloc(count + 1, GFP_KERNEL);
+ if (!write_buf)
+ goto out;
- kaddr = kmap(page);
- ret = cache_do_downcall(kaddr, buf, count, cd);
- kunmap(page);
- unlock_page(page);
- put_page(page);
+ ret = cache_do_downcall(write_buf, buf, count, cd);
+ kvfree(write_buf);
+out:
return ret;
-out_slow:
- return cache_slow_downcall(buf, count, cd);
}
static ssize_t cache_write(struct file *filp, const char __user *buf,
@@ -1436,10 +1418,10 @@ static int c_show(struct seq_file *m, void *p)
cache_get(cp);
if (cache_check(cd, cp, NULL))
/* cache_check does a cache_put on failure */
- seq_printf(m, "# ");
+ seq_puts(m, "# ");
else {
if (cache_is_expired(cd, cp))
- seq_printf(m, "# ");
+ seq_puts(m, "# ");
cache_put(cp, cd);
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 62e0b6c1e8cf..612f0a641f4c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -47,10 +47,6 @@
# define RPCDBG_FACILITY RPCDBG_CALL
#endif
-#define dprint_status(t) \
- dprintk("RPC: %5u %s (status %d)\n", t->tk_pid, \
- __func__, t->tk_status)
-
/*
* All RPC clients are linked into this list
*/
@@ -1255,10 +1251,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages,
unsigned int base, unsigned int len,
unsigned int hdrsize)
{
- /* Subtract one to force an extra word of buffer space for the
- * payload's XDR pad to fall into the rcv_buf's tail iovec.
- */
- hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1;
+ hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign;
xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len);
trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf);
@@ -1639,10 +1632,6 @@ call_start(struct rpc_task *task)
int idx = task->tk_msg.rpc_proc->p_statidx;
trace_rpc_request(task);
- dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid,
- clnt->cl_program->name, clnt->cl_vers,
- rpc_proc_name(task),
- (RPC_IS_ASYNC(task) ? "async" : "sync"));
/* Increment call count (version might not be valid for ping) */
if (clnt->cl_program->version[clnt->cl_vers])
@@ -1658,8 +1647,6 @@ call_start(struct rpc_task *task)
static void
call_reserve(struct rpc_task *task)
{
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_reserveresult;
xprt_reserve(task);
@@ -1675,8 +1662,6 @@ call_reserveresult(struct rpc_task *task)
{
int status = task->tk_status;
- dprint_status(task);
-
/*
* After a call to xprt_reserve(), we must have either
* a request slot or else an error status.
@@ -1717,8 +1702,6 @@ call_reserveresult(struct rpc_task *task)
static void
call_retry_reserve(struct rpc_task *task)
{
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_reserveresult;
xprt_retry_reserve(task);
@@ -1730,8 +1713,6 @@ call_retry_reserve(struct rpc_task *task)
static void
call_refresh(struct rpc_task *task)
{
- dprint_status(task);
-
task->tk_action = call_refreshresult;
task->tk_status = 0;
task->tk_client->cl_stats->rpcauthrefresh++;
@@ -1746,8 +1727,6 @@ call_refreshresult(struct rpc_task *task)
{
int status = task->tk_status;
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_refresh;
switch (status) {
@@ -1770,12 +1749,10 @@ call_refreshresult(struct rpc_task *task)
if (!task->tk_cred_retry)
break;
task->tk_cred_retry--;
- dprintk("RPC: %5u %s: retry refresh creds\n",
- task->tk_pid, __func__);
+ trace_rpc_retry_refresh_status(task);
return;
}
- dprintk("RPC: %5u %s: refresh creds failed with error %d\n",
- task->tk_pid, __func__, status);
+ trace_rpc_refresh_status(task);
rpc_call_rpcerror(task, status);
}
@@ -1792,8 +1769,6 @@ call_allocate(struct rpc_task *task)
const struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
int status;
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_encode;
@@ -1823,6 +1798,7 @@ call_allocate(struct rpc_task *task)
req->rq_rcvsize <<= 2;
status = xprt->ops->buf_alloc(task);
+ trace_rpc_buf_alloc(task, status);
xprt_inject_disconnect(xprt);
if (status == 0)
return;
@@ -1831,8 +1807,6 @@ call_allocate(struct rpc_task *task)
return;
}
- dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
-
if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) {
task->tk_action = call_allocate;
rpc_delay(task, HZ>>4);
@@ -1883,7 +1857,7 @@ call_encode(struct rpc_task *task)
{
if (!rpc_task_need_encode(task))
goto out;
- dprint_status(task);
+
/* Dequeue task from the receive queue while we're encoding */
xprt_request_dequeue_xprt(task);
/* Encode here so that rpcsec_gss can use correct sequence number. */
@@ -1902,8 +1876,7 @@ call_encode(struct rpc_task *task)
} else {
task->tk_action = call_refresh;
task->tk_cred_retry--;
- dprintk("RPC: %5u %s: retry refresh creds\n",
- task->tk_pid, __func__);
+ trace_rpc_retry_refresh_status(task);
}
break;
default:
@@ -1960,8 +1933,6 @@ call_bind(struct rpc_task *task)
return;
}
- dprint_status(task);
-
task->tk_action = call_bind_status;
if (!xprt_prepare_transmit(task))
return;
@@ -1983,8 +1954,6 @@ call_bind_status(struct rpc_task *task)
return;
}
- dprint_status(task);
- trace_rpc_bind_status(task);
if (task->tk_status >= 0)
goto out_next;
if (xprt_bound(xprt)) {
@@ -1994,12 +1963,10 @@ call_bind_status(struct rpc_task *task)
switch (task->tk_status) {
case -ENOMEM:
- dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid);
rpc_delay(task, HZ >> 2);
goto retry_timeout;
case -EACCES:
- dprintk("RPC: %5u remote rpcbind: RPC program/version "
- "unavailable\n", task->tk_pid);
+ trace_rpcb_prog_unavail_err(task);
/* fail immediately if this is an RPC ping */
if (task->tk_msg.rpc_proc->p_proc == 0) {
status = -EOPNOTSUPP;
@@ -2016,17 +1983,14 @@ call_bind_status(struct rpc_task *task)
case -EAGAIN:
goto retry_timeout;
case -ETIMEDOUT:
- dprintk("RPC: %5u rpcbind request timed out\n",
- task->tk_pid);
+ trace_rpcb_timeout_err(task);
goto retry_timeout;
case -EPFNOSUPPORT:
/* server doesn't support any rpcbind version we know of */
- dprintk("RPC: %5u unrecognized remote rpcbind service\n",
- task->tk_pid);
+ trace_rpcb_bind_version_err(task);
break;
case -EPROTONOSUPPORT:
- dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n",
- task->tk_pid);
+ trace_rpcb_bind_version_err(task);
goto retry_timeout;
case -ECONNREFUSED: /* connection problems */
case -ECONNRESET:
@@ -2037,8 +2001,7 @@ call_bind_status(struct rpc_task *task)
case -EHOSTUNREACH:
case -ENETUNREACH:
case -EPIPE:
- dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
- task->tk_pid, task->tk_status);
+ trace_rpcb_unreachable_err(task);
if (!RPC_IS_SOFTCONN(task)) {
rpc_delay(task, 5*HZ);
goto retry_timeout;
@@ -2046,8 +2009,7 @@ call_bind_status(struct rpc_task *task)
status = task->tk_status;
break;
default:
- dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
- task->tk_pid, -task->tk_status);
+ trace_rpcb_unrecognized_err(task);
}
rpc_call_rpcerror(task, status);
@@ -2079,10 +2041,6 @@ call_connect(struct rpc_task *task)
return;
}
- dprintk("RPC: %5u call_connect xprt %p %s connected\n",
- task->tk_pid, xprt,
- (xprt_connected(xprt) ? "is" : "is not"));
-
task->tk_action = call_connect_status;
if (task->tk_status < 0)
return;
@@ -2110,7 +2068,6 @@ call_connect_status(struct rpc_task *task)
return;
}
- dprint_status(task);
trace_rpc_connect_status(task);
if (task->tk_status == 0) {
@@ -2178,8 +2135,6 @@ call_transmit(struct rpc_task *task)
return;
}
- dprint_status(task);
-
task->tk_action = call_transmit_status;
if (!xprt_prepare_transmit(task))
return;
@@ -2214,7 +2169,6 @@ call_transmit_status(struct rpc_task *task)
switch (task->tk_status) {
default:
- dprint_status(task);
break;
case -EBADMSG:
task->tk_status = 0;
@@ -2296,8 +2250,6 @@ call_bc_transmit_status(struct rpc_task *task)
if (rpc_task_transmitted(task))
task->tk_status = 0;
- dprint_status(task);
-
switch (task->tk_status) {
case 0:
/* Success */
@@ -2357,8 +2309,6 @@ call_status(struct rpc_task *task)
if (!task->tk_msg.rpc_proc->p_proc)
trace_xprt_ping(task->tk_xprt, task->tk_status);
- dprint_status(task);
-
status = task->tk_status;
if (status >= 0) {
task->tk_action = call_decode;
@@ -2405,7 +2355,8 @@ call_status(struct rpc_task *task)
goto out_exit;
}
task->tk_action = call_encode;
- rpc_check_timeout(task);
+ if (status != -ECONNRESET && status != -ECONNABORTED)
+ rpc_check_timeout(task);
return;
out_exit:
rpc_call_rpcerror(task, status);
@@ -2433,7 +2384,7 @@ rpc_check_timeout(struct rpc_task *task)
if (xprt_adjust_timeout(task->tk_rqstp) == 0)
return;
- dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid);
+ trace_rpc_timeout_status(task);
task->tk_timeouts++;
if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) {
@@ -2492,8 +2443,6 @@ call_decode(struct rpc_task *task)
struct xdr_stream xdr;
int err;
- dprint_status(task);
-
if (!task->tk_msg.rpc_proc->p_decode) {
task->tk_action = rpc_exit_task;
return;
@@ -2537,8 +2486,6 @@ out:
case 0:
task->tk_action = rpc_exit_task;
task->tk_status = rpcauth_unwrap_resp(task, &xdr);
- dprintk("RPC: %5u %s result %d\n",
- task->tk_pid, __func__, task->tk_status);
return;
case -EAGAIN:
task->tk_status = 0;
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index fd9bca242724..56029e3af6ff 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n
return 0;
len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
xprt->debugfs->d_name.name);
- if (len > sizeof(name))
+ if (len >= sizeof(name))
return -1;
if (*nump == 0)
strcpy(link, "xprt");
else {
len = snprintf(link, sizeof(link), "xprt%d", *nump);
- if (len > sizeof(link))
+ if (len >= sizeof(link))
return -1;
}
debugfs_create_symlink(link, clnt->cl_debugfs, name);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index eadc0ede928c..8241f5a4a01c 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -781,7 +781,8 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
}
/**
- * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication
+ * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace
+ * communication
* @parent: dentry of directory to create new "pipe" in
* @name: name of pipe
* @private: private data to associate with the pipe, for the caller's use
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 4a67685c83eb..38fe2ce8a5aa 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -31,11 +31,9 @@
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/xprtsock.h>
-#include "netns.h"
+#include <trace/events/sunrpc.h>
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_BIND
-#endif
+#include "netns.h"
#define RPCBIND_SOCK_PATHNAME "/var/run/rpcbind.sock"
@@ -216,10 +214,6 @@ static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt,
sn->rpcb_is_af_local = is_af_local ? 1 : 0;
smp_wmb();
sn->rpcb_users = 1;
- dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: "
- "%p, rpcb_local_clnt4: %p) for net %x%s\n",
- sn->rpcb_local_clnt, sn->rpcb_local_clnt4,
- net->ns.inum, (net == &init_net) ? " (init_net)" : "");
}
/*
@@ -261,19 +255,13 @@ static int rpcb_create_local_unix(struct net *net)
*/
clnt = rpc_create(&args);
if (IS_ERR(clnt)) {
- dprintk("RPC: failed to create AF_LOCAL rpcbind "
- "client (errno %ld).\n", PTR_ERR(clnt));
result = PTR_ERR(clnt);
goto out;
}
clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
- if (IS_ERR(clnt4)) {
- dprintk("RPC: failed to bind second program to "
- "rpcbind v4 client (errno %ld).\n",
- PTR_ERR(clnt4));
+ if (IS_ERR(clnt4))
clnt4 = NULL;
- }
rpcb_set_local(net, clnt, clnt4, true);
@@ -309,8 +297,6 @@ static int rpcb_create_local_net(struct net *net)
clnt = rpc_create(&args);
if (IS_ERR(clnt)) {
- dprintk("RPC: failed to create local rpcbind "
- "client (errno %ld).\n", PTR_ERR(clnt));
result = PTR_ERR(clnt);
goto out;
}
@@ -321,12 +307,8 @@ static int rpcb_create_local_net(struct net *net)
* v4 upcalls.
*/
clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
- if (IS_ERR(clnt4)) {
- dprintk("RPC: failed to bind second program to "
- "rpcbind v4 client (errno %ld).\n",
- PTR_ERR(clnt4));
+ if (IS_ERR(clnt4))
clnt4 = NULL;
- }
rpcb_set_local(net, clnt, clnt4, false);
@@ -403,11 +385,8 @@ static int rpcb_register_call(struct sunrpc_net *sn, struct rpc_clnt *clnt, stru
msg->rpc_resp = &result;
error = rpc_call_sync(clnt, msg, flags);
- if (error < 0) {
- dprintk("RPC: failed to contact local rpcbind "
- "server (errno %d).\n", -error);
+ if (error < 0)
return error;
- }
if (!result)
return -EACCES;
@@ -461,9 +440,7 @@ int rpcb_register(struct net *net, u32 prog, u32 vers, int prot, unsigned short
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
bool is_set = false;
- dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
- "rpcbind\n", (port ? "" : "un"),
- prog, vers, prot, port);
+ trace_pmap_register(prog, vers, prot, port);
msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET];
if (port != 0) {
@@ -489,11 +466,6 @@ static int rpcb_register_inet4(struct sunrpc_net *sn,
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
- dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
- "local rpcbind\n", (port ? "" : "un"),
- map->r_prog, map->r_vers,
- map->r_addr, map->r_netid);
-
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
if (port != 0) {
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
@@ -520,11 +492,6 @@ static int rpcb_register_inet6(struct sunrpc_net *sn,
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
- dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
- "local rpcbind\n", (port ? "" : "un"),
- map->r_prog, map->r_vers,
- map->r_addr, map->r_netid);
-
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
if (port != 0) {
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
@@ -541,9 +508,7 @@ static int rpcb_unregister_all_protofamilies(struct sunrpc_net *sn,
{
struct rpcbind_args *map = msg->rpc_argp;
- dprintk("RPC: unregistering [%u, %u, '%s'] with "
- "local rpcbind\n",
- map->r_prog, map->r_vers, map->r_netid);
+ trace_rpcb_unregister(map->r_prog, map->r_vers, map->r_netid);
map->r_addr = "";
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
@@ -615,6 +580,8 @@ int rpcb_v4_register(struct net *net, const u32 program, const u32 version,
if (address == NULL)
return rpcb_unregister_all_protofamilies(sn, &msg);
+ trace_rpcb_register(map.r_prog, map.r_vers, map.r_addr, map.r_netid);
+
switch (address->sa_family) {
case AF_INET:
return rpcb_register_inet4(sn, address, &msg);
@@ -693,18 +660,12 @@ void rpcb_getport_async(struct rpc_task *task)
rcu_read_unlock();
xprt = xprt_get(task->tk_xprt);
- dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
- task->tk_pid, __func__,
- xprt->servername, clnt->cl_prog, clnt->cl_vers, xprt->prot);
-
/* Put self on the wait queue to ensure we get notified if
* some other task is already attempting to bind the port */
rpc_sleep_on_timeout(&xprt->binding, task,
NULL, jiffies + xprt->bind_timeout);
if (xprt_test_and_set_binding(xprt)) {
- dprintk("RPC: %5u %s: waiting for another binder\n",
- task->tk_pid, __func__);
xprt_put(xprt);
return;
}
@@ -712,8 +673,6 @@ void rpcb_getport_async(struct rpc_task *task)
/* Someone else may have bound if we slept */
if (xprt_bound(xprt)) {
status = 0;
- dprintk("RPC: %5u %s: already bound\n",
- task->tk_pid, __func__);
goto bailout_nofree;
}
@@ -732,20 +691,15 @@ void rpcb_getport_async(struct rpc_task *task)
break;
default:
status = -EAFNOSUPPORT;
- dprintk("RPC: %5u %s: bad address family\n",
- task->tk_pid, __func__);
goto bailout_nofree;
}
if (proc == NULL) {
xprt->bind_index = 0;
status = -EPFNOSUPPORT;
- dprintk("RPC: %5u %s: no more getport versions available\n",
- task->tk_pid, __func__);
goto bailout_nofree;
}
- dprintk("RPC: %5u %s: trying rpcbind version %u\n",
- task->tk_pid, __func__, bind_version);
+ trace_rpcb_getport(clnt, task, bind_version);
rpcb_clnt = rpcb_create(xprt->xprt_net,
clnt->cl_nodename,
@@ -754,16 +708,12 @@ void rpcb_getport_async(struct rpc_task *task)
clnt->cl_cred);
if (IS_ERR(rpcb_clnt)) {
status = PTR_ERR(rpcb_clnt);
- dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
- task->tk_pid, __func__, PTR_ERR(rpcb_clnt));
goto bailout_nofree;
}
map = kzalloc(sizeof(struct rpcbind_args), GFP_NOFS);
if (!map) {
status = -ENOMEM;
- dprintk("RPC: %5u %s: no memory available\n",
- task->tk_pid, __func__);
goto bailout_release_client;
}
map->r_prog = clnt->cl_prog;
@@ -780,8 +730,6 @@ void rpcb_getport_async(struct rpc_task *task)
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_NOFS);
if (!map->r_addr) {
status = -ENOMEM;
- dprintk("RPC: %5u %s: no memory available\n",
- task->tk_pid, __func__);
goto bailout_free_args;
}
map->r_owner = "";
@@ -818,34 +766,33 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
{
struct rpcbind_args *map = data;
struct rpc_xprt *xprt = map->r_xprt;
- int status = child->tk_status;
+
+ map->r_status = child->tk_status;
/* Garbage reply: retry with a lesser rpcbind version */
- if (status == -EIO)
- status = -EPROTONOSUPPORT;
+ if (map->r_status == -EIO)
+ map->r_status = -EPROTONOSUPPORT;
/* rpcbind server doesn't support this rpcbind protocol version */
- if (status == -EPROTONOSUPPORT)
+ if (map->r_status == -EPROTONOSUPPORT)
xprt->bind_index++;
- if (status < 0) {
+ if (map->r_status < 0) {
/* rpcbind server not available on remote host? */
- xprt->ops->set_port(xprt, 0);
+ map->r_port = 0;
+
} else if (map->r_port == 0) {
/* Requested RPC service wasn't registered on remote host */
- xprt->ops->set_port(xprt, 0);
- status = -EACCES;
+ map->r_status = -EACCES;
} else {
/* Succeeded */
- xprt->ops->set_port(xprt, map->r_port);
- xprt_set_bound(xprt);
- status = 0;
+ map->r_status = 0;
}
- dprintk("RPC: %5u rpcb_getport_done(status %d, port %u)\n",
- child->tk_pid, status, map->r_port);
-
- map->r_status = status;
+ trace_rpcb_setport(child, map->r_status, map->r_port);
+ xprt->ops->set_port(xprt, map->r_port);
+ if (map->r_port)
+ xprt_set_bound(xprt);
}
/*
@@ -858,11 +805,6 @@ static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr,
const struct rpcbind_args *rpcb = data;
__be32 *p;
- dprintk("RPC: %5u encoding PMAP_%s call (%u, %u, %d, %u)\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name,
- rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
-
p = xdr_reserve_space(xdr, RPCB_mappingargs_sz << 2);
*p++ = cpu_to_be32(rpcb->r_prog);
*p++ = cpu_to_be32(rpcb->r_vers);
@@ -884,8 +826,6 @@ static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr,
return -EIO;
port = be32_to_cpup(p);
- dprintk("RPC: %5u PMAP_%s result: %lu\n", req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name, port);
if (unlikely(port > USHRT_MAX))
return -EIO;
@@ -906,11 +846,6 @@ static int rpcb_dec_set(struct rpc_rqst *req, struct xdr_stream *xdr,
*boolp = 0;
if (*p != xdr_zero)
*boolp = 1;
-
- dprintk("RPC: %5u RPCB_%s call %s\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name,
- (*boolp ? "succeeded" : "failed"));
return 0;
}
@@ -935,12 +870,6 @@ static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
const struct rpcbind_args *rpcb = data;
__be32 *p;
- dprintk("RPC: %5u encoding RPCB_%s call (%u, %u, '%s', '%s')\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name,
- rpcb->r_prog, rpcb->r_vers,
- rpcb->r_netid, rpcb->r_addr);
-
p = xdr_reserve_space(xdr, (RPCB_program_sz + RPCB_version_sz) << 2);
*p++ = cpu_to_be32(rpcb->r_prog);
*p = cpu_to_be32(rpcb->r_vers);
@@ -970,11 +899,8 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
* If the returned universal address is a null string,
* the requested RPC service was not registered.
*/
- if (len == 0) {
- dprintk("RPC: %5u RPCB reply: program not registered\n",
- req->rq_task->tk_pid);
+ if (len == 0)
return 0;
- }
if (unlikely(len > RPCBIND_MAXUADDRLEN))
goto out_fail;
@@ -982,8 +908,6 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, len);
if (unlikely(p == NULL))
goto out_fail;
- dprintk("RPC: %5u RPCB_%s reply: %*pE\n", req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name, len, (char *)p);
if (rpc_uaddr2sockaddr(req->rq_xprt->xprt_net, (char *)p, len,
sap, sizeof(address)) == 0)
@@ -993,9 +917,6 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
return 0;
out_fail:
- dprintk("RPC: %5u malformed RPCB_%s reply\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name);
return -EIO;
}
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 7eba20a88438..cf702a5f7fe5 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -27,10 +27,6 @@
#include "sunrpc.h"
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-#define RPCDBG_FACILITY RPCDBG_SCHED
-#endif
-
#define CREATE_TRACE_POINTS
#include <trace/events/sunrpc.h>
@@ -85,7 +81,6 @@ __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
{
if (list_empty(&task->u.tk_wait.timer_list))
return;
- dprintk("RPC: %5u disabling timer\n", task->tk_pid);
task->tk_timeout = 0;
list_del(&task->u.tk_wait.timer_list);
if (list_empty(&queue->timer_list.list))
@@ -111,9 +106,6 @@ static void
__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task,
unsigned long timeout)
{
- dprintk("RPC: %5u setting alarm for %u ms\n",
- task->tk_pid, jiffies_to_msecs(timeout - jiffies));
-
task->tk_timeout = timeout;
if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires))
rpc_set_queue_timer(queue, timeout);
@@ -216,9 +208,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
/* barrier matches the read in rpc_wake_up_task_queue_locked() */
smp_wmb();
rpc_set_queued(task);
-
- dprintk("RPC: %5u added to queue %p \"%s\"\n",
- task->tk_pid, queue, rpc_qname(queue));
}
/*
@@ -241,8 +230,6 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas
else
list_del(&task->u.tk_wait.list);
queue->qlen--;
- dprintk("RPC: %5u removed from queue %p \"%s\"\n",
- task->tk_pid, queue, rpc_qname(queue));
}
static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues)
@@ -382,13 +369,9 @@ static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q,
struct rpc_task *task,
unsigned char queue_priority)
{
- dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
- task->tk_pid, rpc_qname(q), jiffies);
-
trace_rpc_task_sleep(task, q);
__rpc_add_wait_queue(q, task, queue_priority);
-
}
static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
@@ -510,9 +493,6 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
struct rpc_wait_queue *queue,
struct rpc_task *task)
{
- dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
- task->tk_pid, jiffies);
-
/* Has the task been executed yet? If not, we cannot wake it up! */
if (!RPC_IS_ACTIVATED(task)) {
printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
@@ -524,8 +504,6 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
__rpc_remove_wait_queue(queue, task);
rpc_make_runnable(wq, task);
-
- dprintk("RPC: __rpc_wake_up_task done\n");
}
/*
@@ -663,8 +641,6 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
{
struct rpc_task *task = NULL;
- dprintk("RPC: wake_up_first(%p \"%s\")\n",
- queue, rpc_qname(queue));
spin_lock(&queue->lock);
task = __rpc_find_next_queued(queue);
if (task != NULL)
@@ -700,6 +676,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue)
EXPORT_SYMBOL_GPL(rpc_wake_up_next);
/**
+ * rpc_wake_up_locked - wake up all rpc_tasks
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ *
+ */
+static void rpc_wake_up_locked(struct rpc_wait_queue *queue)
+{
+ struct rpc_task *task;
+
+ for (;;) {
+ task = __rpc_find_next_queued(queue);
+ if (task == NULL)
+ break;
+ rpc_wake_up_task_queue_locked(queue, task);
+ }
+}
+
+/**
* rpc_wake_up - wake up all rpc_tasks
* @queue: rpc_wait_queue on which the tasks are sleeping
*
@@ -707,25 +700,28 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next);
*/
void rpc_wake_up(struct rpc_wait_queue *queue)
{
- struct list_head *head;
-
spin_lock(&queue->lock);
- head = &queue->tasks[queue->maxpriority];
+ rpc_wake_up_locked(queue);
+ spin_unlock(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up);
+
+/**
+ * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value.
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ * @status: status value to set
+ */
+static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status)
+{
+ struct rpc_task *task;
+
for (;;) {
- while (!list_empty(head)) {
- struct rpc_task *task;
- task = list_first_entry(head,
- struct rpc_task,
- u.tk_wait.list);
- rpc_wake_up_task_queue_locked(queue, task);
- }
- if (head == &queue->tasks[0])
+ task = __rpc_find_next_queued(queue);
+ if (task == NULL)
break;
- head--;
+ rpc_wake_up_task_queue_set_status_locked(queue, task, status);
}
- spin_unlock(&queue->lock);
}
-EXPORT_SYMBOL_GPL(rpc_wake_up);
/**
* rpc_wake_up_status - wake up all rpc_tasks and set their status value.
@@ -736,23 +732,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up);
*/
void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
{
- struct list_head *head;
-
spin_lock(&queue->lock);
- head = &queue->tasks[queue->maxpriority];
- for (;;) {
- while (!list_empty(head)) {
- struct rpc_task *task;
- task = list_first_entry(head,
- struct rpc_task,
- u.tk_wait.list);
- task->tk_status = status;
- rpc_wake_up_task_queue_locked(queue, task);
- }
- if (head == &queue->tasks[0])
- break;
- head--;
- }
+ rpc_wake_up_status_locked(queue, status);
spin_unlock(&queue->lock);
}
EXPORT_SYMBOL_GPL(rpc_wake_up_status);
@@ -770,7 +751,7 @@ static void __rpc_queue_timer_fn(struct work_struct *work)
list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
timeo = task->tk_timeout;
if (time_after_eq(now, timeo)) {
- dprintk("RPC: %5u timeout\n", task->tk_pid);
+ trace_rpc_task_timeout(task, task->tk_action);
task->tk_status = -ETIMEDOUT;
rpc_wake_up_task_queue_locked(queue, task);
continue;
@@ -885,9 +866,6 @@ static void __rpc_execute(struct rpc_task *task)
int task_is_async = RPC_IS_ASYNC(task);
int status = 0;
- dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
- task->tk_pid, task->tk_flags);
-
WARN_ON_ONCE(RPC_IS_QUEUED(task));
if (RPC_IS_QUEUED(task))
return;
@@ -947,7 +925,7 @@ static void __rpc_execute(struct rpc_task *task)
return;
/* sync task: sleep here */
- dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid);
+ trace_rpc_task_sync_sleep(task, task->tk_action);
status = out_of_line_wait_on_bit(&task->tk_runstate,
RPC_TASK_QUEUED, rpc_wait_bit_killable,
TASK_KILLABLE);
@@ -963,11 +941,9 @@ static void __rpc_execute(struct rpc_task *task)
task->tk_rpc_status = -ERESTARTSYS;
rpc_exit(task, -ERESTARTSYS);
}
- dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
+ trace_rpc_task_sync_wake(task, task->tk_action);
}
- dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status,
- task->tk_status);
/* Release all resources associated with the task */
rpc_release_task(task);
}
@@ -1036,8 +1012,6 @@ int rpc_malloc(struct rpc_task *task)
return -ENOMEM;
buf->len = size;
- dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
- task->tk_pid, size, buf);
rqst->rq_buffer = buf->data;
rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
return 0;
@@ -1058,9 +1032,6 @@ void rpc_free(struct rpc_task *task)
buf = container_of(buffer, struct rpc_buffer, data);
size = buf->len;
- dprintk("RPC: freeing buffer of size %zu at %p\n",
- size, buf);
-
if (size <= RPC_BUFFER_MAXSIZE)
mempool_free(buf, rpc_buffer_mempool);
else
@@ -1095,9 +1066,6 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
task->tk_action = rpc_prepare_task;
rpc_init_task_statistics(task);
-
- dprintk("RPC: new task initialized, procpid %u\n",
- task_pid_nr(current));
}
static struct rpc_task *
@@ -1121,7 +1089,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
rpc_init_task(task, setup_data);
task->tk_flags |= flags;
- dprintk("RPC: allocated task %p\n", task);
return task;
}
@@ -1151,10 +1118,8 @@ static void rpc_free_task(struct rpc_task *task)
put_rpccred(task->tk_op_cred);
rpc_release_calldata(task->tk_ops, task->tk_calldata);
- if (tk_flags & RPC_TASK_DYNAMIC) {
- dprintk("RPC: %5u freeing task\n", task->tk_pid);
+ if (tk_flags & RPC_TASK_DYNAMIC)
mempool_free(task, rpc_task_mempool);
- }
}
static void rpc_async_release(struct work_struct *work)
@@ -1208,8 +1173,6 @@ EXPORT_SYMBOL_GPL(rpc_put_task_async);
static void rpc_release_task(struct rpc_task *task)
{
- dprintk("RPC: %5u release task\n", task->tk_pid);
-
WARN_ON_ONCE(RPC_IS_QUEUED(task));
rpc_release_resources_task(task);
@@ -1250,7 +1213,6 @@ static int rpciod_start(void)
/*
* Create the rpciod thread and wait for it to start.
*/
- dprintk("RPC: creating workqueue rpciod\n");
wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!wq)
goto out_failed;
@@ -1275,7 +1237,6 @@ static void rpciod_stop(void)
if (rpciod_workqueue == NULL)
return;
- dprintk("RPC: destroying workqueue rpciod\n");
wq = rpciod_workqueue;
rpciod_workqueue = NULL;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 3fc8af8bb961..d52313af82bc 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -70,7 +70,7 @@ static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to,
if (len > desc->count)
len = desc->count;
pos = desc->offset;
- csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0);
+ csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len);
desc->csum = csum_block_add(desc->csum, csum2, pos);
desc->count -= len;
desc->offset += len;
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index f6fe2e6cd65a..2f59464e6524 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -4,7 +4,7 @@
NetApp provides this source code under the GPL v2 License.
The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
+https://opensource.org/licenses/gpl-license.php.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c211b607239e..4187745887f0 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -614,6 +614,10 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
rqstp->rq_server = serv;
rqstp->rq_pool = pool;
+ rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0);
+ if (!rqstp->rq_scratch_page)
+ goto out_enomem;
+
rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
if (!rqstp->rq_argp)
goto out_enomem;
@@ -842,6 +846,7 @@ void
svc_rqst_free(struct svc_rqst *rqstp)
{
svc_release_buffer(rqstp);
+ put_page(rqstp->rq_scratch_page);
kfree(rqstp->rq_resp);
kfree(rqstp->rq_argp);
kfree(rqstp->rq_auth_data);
@@ -1622,7 +1627,7 @@ u32 svc_max_payload(const struct svc_rqst *rqstp)
EXPORT_SYMBOL_GPL(svc_max_payload);
/**
- * svc_encode_read_payload - mark a range of bytes as a READ payload
+ * svc_encode_result_payload - mark a range of bytes as a result payload
* @rqstp: svc_rqst to operate on
* @offset: payload's byte offset in rqstp->rq_res
* @length: size of payload, in bytes
@@ -1630,12 +1635,13 @@ EXPORT_SYMBOL_GPL(svc_max_payload);
* Returns zero on success, or a negative errno if a permanent
* error occurred.
*/
-int svc_encode_read_payload(struct svc_rqst *rqstp, unsigned int offset,
- unsigned int length)
+int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
{
- return rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, offset, length);
+ return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset,
+ length);
}
-EXPORT_SYMBOL_GPL(svc_encode_read_payload);
+EXPORT_SYMBOL_GPL(svc_encode_result_payload);
/**
* svc_fill_write_vector - Construct data argument for VFS write call
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 43cf8dbde898..dcc50ae54550 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -813,8 +813,6 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
len = svc_deferred_recv(rqstp);
else
len = xprt->xpt_ops->xpo_recvfrom(rqstp);
- if (len > 0)
- trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg);
rqstp->rq_stime = ktime_get();
rqstp->rq_reserved = serv->sv_max_mesg;
atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
@@ -859,6 +857,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
err = -EAGAIN;
if (len <= 0)
goto out_release;
+ trace_svc_xdr_recvfrom(&rqstp->rq_arg);
clear_bit(XPT_OLD, &xprt->xpt_flags);
@@ -868,7 +867,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
if (serv->sv_stats)
serv->sv_stats->netcnt++;
- trace_svc_recv(rqstp, len);
return len;
out_release:
rqstp->rq_res.len = 0;
@@ -906,7 +904,7 @@ int svc_send(struct svc_rqst *rqstp)
xb->len = xb->head[0].iov_len +
xb->page_len +
xb->tail[0].iov_len;
- trace_svc_xdr_sendto(rqstp, xb);
+ trace_svc_xdr_sendto(rqstp->rq_xid, xb);
trace_svc_stats_latency(rqstp);
len = xprt->xpt_ops->xpo_sendto(rqstp);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c2752e2b9ce3..5a809c64dc7b 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -181,8 +181,8 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
}
}
-static int svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int offset,
- unsigned int length)
+static int svc_sock_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
{
return 0;
}
@@ -635,7 +635,7 @@ static const struct svc_xprt_ops svc_udp_ops = {
.xpo_create = svc_udp_create,
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
- .xpo_read_payload = svc_sock_read_payload,
+ .xpo_result_payload = svc_sock_result_payload,
.xpo_release_rqst = svc_udp_release_rqst,
.xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free,
@@ -1062,6 +1062,91 @@ err_noclose:
return 0; /* record not complete */
}
+static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
+ int flags)
+{
+ return kernel_sendpage(sock, virt_to_page(vec->iov_base),
+ offset_in_page(vec->iov_base),
+ vec->iov_len, flags);
+}
+
+/*
+ * kernel_sendpage() is used exclusively to reduce the number of
+ * copy operations in this path. Therefore the caller must ensure
+ * that the pages backing @xdr are unchanging.
+ *
+ * In addition, the logic assumes that .bv_len is never larger
+ * than PAGE_SIZE.
+ */
+static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, rpc_fraghdr marker,
+ unsigned int *sentp)
+{
+ const struct kvec *head = xdr->head;
+ const struct kvec *tail = xdr->tail;
+ struct kvec rm = {
+ .iov_base = &marker,
+ .iov_len = sizeof(marker),
+ };
+ int flags, ret;
+
+ *sentp = 0;
+ xdr_alloc_bvec(xdr, GFP_KERNEL);
+
+ msg->msg_flags = MSG_MORE;
+ ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != rm.iov_len)
+ return -EAGAIN;
+
+ flags = head->iov_len < xdr->len ? MSG_MORE | MSG_SENDPAGE_NOTLAST : 0;
+ ret = svc_tcp_send_kvec(sock, head, flags);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != head->iov_len)
+ goto out;
+
+ if (xdr->page_len) {
+ unsigned int offset, len, remaining;
+ struct bio_vec *bvec;
+
+ bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT);
+ offset = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ while (remaining > 0) {
+ if (remaining <= PAGE_SIZE && tail->iov_len == 0)
+ flags = 0;
+
+ len = min(remaining, bvec->bv_len - offset);
+ ret = kernel_sendpage(sock, bvec->bv_page,
+ bvec->bv_offset + offset,
+ len, flags);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != len)
+ goto out;
+ remaining -= len;
+ offset = 0;
+ bvec++;
+ }
+ }
+
+ if (tail->iov_len) {
+ ret = svc_tcp_send_kvec(sock, tail, 0);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ }
+
+out:
+ return 0;
+}
+
/**
* svc_tcp_sendto - Send out a reply on a TCP socket
* @rqstp: completed svc_rqst
@@ -1089,7 +1174,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
mutex_lock(&xprt->xpt_mutex);
if (svc_xprt_is_dead(xprt))
goto out_notconn;
- err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
+ err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent);
xdr_free_bvec(xdr);
trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
if (err < 0 || sent != (xdr->len + sizeof(marker)))
@@ -1123,7 +1208,7 @@ static const struct svc_xprt_ops svc_tcp_ops = {
.xpo_create = svc_tcp_create,
.xpo_recvfrom = svc_tcp_recvfrom,
.xpo_sendto = svc_tcp_sendto,
- .xpo_read_payload = svc_sock_read_payload,
+ .xpo_result_payload = svc_sock_result_payload,
.xpo_release_rqst = svc_tcp_release_rqst,
.xpo_detach = svc_tcp_sock_detach,
.xpo_free = svc_sock_free,
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 999eee1ed61c..3aad6ef18504 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -63,14 +63,21 @@ static int proc_do_xprt(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
char tmpbuf[256];
- size_t len;
+ ssize_t len;
- if ((*ppos && !write) || !*lenp) {
+ if (write || *ppos) {
*lenp = 0;
return 0;
}
len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
- return memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
+ len = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
+
+ if (len < 0) {
+ *lenp = 0;
+ return -EINVAL;
+ }
+ *lenp = len;
+ return 0;
}
static int
@@ -108,8 +115,10 @@ proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp,
left -= (s - tmpbuf);
if (left && !isspace(*s))
return -EINVAL;
- while (left && isspace(*s))
- left--, s++;
+ while (left && isspace(*s)) {
+ left--;
+ s++;
+ }
} else
left = 0;
*(unsigned int *) table->data = value;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index be11d672b5b9..3964ff74ee51 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -19,6 +19,9 @@
#include <linux/bvec.h>
#include <trace/events/sunrpc.h>
+static void _copy_to_pages(struct page **, size_t, const char *, size_t);
+
+
/*
* XDR functions for basic NFS types
*/
@@ -120,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
* @len: length of string, in bytes
*
*/
-void
-xdr_terminate_string(struct xdr_buf *buf, const u32 len)
+void xdr_terminate_string(const struct xdr_buf *buf, const u32 len)
{
char *kaddr;
@@ -131,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len)
}
EXPORT_SYMBOL_GPL(xdr_terminate_string);
-size_t
-xdr_buf_pagecount(struct xdr_buf *buf)
+size_t xdr_buf_pagecount(const struct xdr_buf *buf)
{
if (!buf->page_len)
return 0;
@@ -190,9 +191,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
tail->iov_base = buf + offset;
tail->iov_len = buflen - offset;
- if ((xdr->page_len & 3) == 0)
- tail->iov_len -= sizeof(__be32);
-
xdr->buflen += len;
}
EXPORT_SYMBOL_GPL(xdr_inline_pages);
@@ -202,6 +200,71 @@ EXPORT_SYMBOL_GPL(xdr_inline_pages);
*/
/**
+ * _shift_data_left_pages
+ * @pages: vector of pages containing both the source and dest memory area.
+ * @pgto_base: page vector address of destination
+ * @pgfrom_base: page vector address of source
+ * @len: number of bytes to copy
+ *
+ * Note: the addresses pgto_base and pgfrom_base are both calculated in
+ * the same way:
+ * if a memory area starts at byte 'base' in page 'pages[i]',
+ * then its address is given as (i << PAGE_SHIFT) + base
+ * Also note: pgto_base must be < pgfrom_base, but the memory areas
+ * they point to may overlap.
+ */
+static void
+_shift_data_left_pages(struct page **pages, size_t pgto_base,
+ size_t pgfrom_base, size_t len)
+{
+ struct page **pgfrom, **pgto;
+ char *vfrom, *vto;
+ size_t copy;
+
+ BUG_ON(pgfrom_base <= pgto_base);
+
+ if (!len)
+ return;
+
+ pgto = pages + (pgto_base >> PAGE_SHIFT);
+ pgfrom = pages + (pgfrom_base >> PAGE_SHIFT);
+
+ pgto_base &= ~PAGE_MASK;
+ pgfrom_base &= ~PAGE_MASK;
+
+ do {
+ if (pgto_base >= PAGE_SIZE) {
+ pgto_base = 0;
+ pgto++;
+ }
+ if (pgfrom_base >= PAGE_SIZE) {
+ pgfrom_base = 0;
+ pgfrom++;
+ }
+
+ copy = len;
+ if (copy > (PAGE_SIZE - pgto_base))
+ copy = PAGE_SIZE - pgto_base;
+ if (copy > (PAGE_SIZE - pgfrom_base))
+ copy = PAGE_SIZE - pgfrom_base;
+
+ vto = kmap_atomic(*pgto);
+ if (*pgto != *pgfrom) {
+ vfrom = kmap_atomic(*pgfrom);
+ memcpy(vto + pgto_base, vfrom + pgfrom_base, copy);
+ kunmap_atomic(vfrom);
+ } else
+ memmove(vto + pgto_base, vto + pgfrom_base, copy);
+ flush_dcache_page(*pgto);
+ kunmap_atomic(vto);
+
+ pgto_base += copy;
+ pgfrom_base += copy;
+
+ } while ((len -= copy) != 0);
+}
+
+/**
* _shift_data_right_pages
* @pages: vector of pages containing both the source and dest memory area.
* @pgto_base: page vector address of destination
@@ -225,6 +288,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
BUG_ON(pgto_base <= pgfrom_base);
+ if (!len)
+ return;
+
pgto_base += len;
pgfrom_base += len;
@@ -283,6 +349,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
char *vto;
size_t copy;
+ if (!len)
+ return;
+
pgto = pages + (pgbase >> PAGE_SHIFT);
pgbase &= ~PAGE_MASK;
@@ -327,6 +396,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
char *vfrom;
size_t copy;
+ if (!len)
+ return;
+
pgfrom = pages + (pgbase >> PAGE_SHIFT);
pgbase &= ~PAGE_MASK;
@@ -350,148 +422,423 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
}
EXPORT_SYMBOL_GPL(_copy_from_pages);
+static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base,
+ unsigned int len)
+{
+ if (base >= iov->iov_len)
+ return;
+ if (len > iov->iov_len - base)
+ len = iov->iov_len - base;
+ memset(iov->iov_base + base, 0, len);
+}
+
/**
- * xdr_shrink_bufhead
+ * xdr_buf_pages_zero
* @buf: xdr_buf
- * @len: bytes to remove from buf->head[0]
- *
- * Shrinks XDR buffer's header kvec buf->head[0] by
- * 'len' bytes. The extra data is not lost, but is instead
- * moved into the inlined pages and/or the tail.
+ * @pgbase: beginning offset
+ * @len: length
*/
-static unsigned int
-xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
+static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase,
+ unsigned int len)
{
- struct kvec *head, *tail;
- size_t copy, offs;
- unsigned int pglen = buf->page_len;
- unsigned int result;
+ struct page **pages = buf->pages;
+ struct page **page;
+ char *vpage;
+ unsigned int zero;
- result = 0;
- tail = buf->tail;
- head = buf->head;
+ if (!len)
+ return;
+ if (pgbase >= buf->page_len) {
+ xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len);
+ return;
+ }
+ if (pgbase + len > buf->page_len) {
+ xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len);
+ len = buf->page_len - pgbase;
+ }
- WARN_ON_ONCE(len > head->iov_len);
- if (len > head->iov_len)
- len = head->iov_len;
-
- /* Shift the tail first */
- if (tail->iov_len != 0) {
- if (tail->iov_len > len) {
- copy = tail->iov_len - len;
- memmove((char *)tail->iov_base + len,
- tail->iov_base, copy);
- result += copy;
- }
- /* Copy from the inlined pages into the tail */
- copy = len;
- if (copy > pglen)
- copy = pglen;
- offs = len - copy;
- if (offs >= tail->iov_len)
- copy = 0;
- else if (copy > tail->iov_len - offs)
- copy = tail->iov_len - offs;
- if (copy != 0) {
- _copy_from_pages((char *)tail->iov_base + offs,
- buf->pages,
- buf->page_base + pglen + offs - len,
- copy);
- result += copy;
+ pgbase += buf->page_base;
+
+ page = pages + (pgbase >> PAGE_SHIFT);
+ pgbase &= ~PAGE_MASK;
+
+ do {
+ zero = PAGE_SIZE - pgbase;
+ if (zero > len)
+ zero = len;
+
+ vpage = kmap_atomic(*page);
+ memset(vpage + pgbase, 0, zero);
+ kunmap_atomic(vpage);
+
+ flush_dcache_page(*page);
+ pgbase = 0;
+ page++;
+
+ } while ((len -= zero) != 0);
+}
+
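+/* For a buffer marked XDRBUF_SPARSE_PAGES, allocate any page slots that
+ * are still missing from the range covered by @buflen. If an allocation
+ * fails, the usable length is trimmed back to the last page that could
+ * be provided. Returns the (possibly reduced) usable buffer length.
+ */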
+static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf,
+ unsigned int buflen, gfp_t gfp)
+{
+ unsigned int i, npages, pagelen;
+
+ if (!(buf->flags & XDRBUF_SPARSE_PAGES))
+ return buflen;
+ if (buflen <= buf->head->iov_len)
+ return buflen;
+ pagelen = buflen - buf->head->iov_len;
+ if (pagelen > buf->page_len)
+ pagelen = buf->page_len;
+ npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < npages; i++) {
+ if (buf->pages[i])
+ continue;
+ buf->pages[i] = alloc_page(gfp);
+ if (likely(buf->pages[i]))
+ continue;
+ buflen -= pagelen;
+ pagelen = i << PAGE_SHIFT;
+ if (pagelen > buf->page_base)
+ buflen += pagelen - buf->page_base;
+ break;
+ }
+ return buflen;
+}
+
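+/* Try to grow buf->len by up to @len bytes: first claim any space that is
+ * already accounted for in head/pages/tail (filling in sparse pages as
+ * needed), then extend the tail into whatever slack remains in buf->buflen.
+ */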
+static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len)
+{
+ struct kvec *head = buf->head;
+ struct kvec *tail = buf->tail;
+ unsigned int sum = head->iov_len + buf->page_len + tail->iov_len;
+ unsigned int free_space, newlen;
+
+ if (sum > buf->len) {
+ free_space = min_t(unsigned int, sum - buf->len, len);
+ newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space,
+ GFP_KERNEL);
+ free_space = newlen - buf->len;
+ buf->len = newlen;
+ len -= free_space;
+ if (!len)
+ return;
+ }
+
+ if (buf->buflen > sum) {
+ /* Expand the tail buffer */
+ free_space = min_t(unsigned int, buf->buflen - sum, len);
+ tail->iov_len += free_space;
+ buf->len += free_space;
+ }
+}
+
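+/* The *_copy_right() helpers below move a range of data towards the end of
+ * the buffer by @shift bytes, spilling from the head into the pages and
+ * from the pages into the tail as needed. The *_shift_right() wrappers
+ * split ranges that span more than one region.
+ */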
+static void xdr_buf_tail_copy_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+ unsigned int to = base + shift;
+
+ if (to >= tail->iov_len)
+ return;
+ if (len + to > tail->iov_len)
+ len = tail->iov_len - to;
+ memmove(tail->iov_base + to, tail->iov_base + base, len);
+}
+
+static void xdr_buf_pages_copy_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+ unsigned int to = base + shift;
+ unsigned int pglen = 0;
+ unsigned int talen = 0, tato = 0;
+
+ if (base >= buf->page_len)
+ return;
+ if (len > buf->page_len - base)
+ len = buf->page_len - base;
+ if (to >= buf->page_len) {
+ tato = to - buf->page_len;
+ if (tail->iov_len >= len + tato)
+ talen = len;
+ else if (tail->iov_len > tato)
+ talen = tail->iov_len - tato;
+ } else if (len + to >= buf->page_len) {
+ pglen = buf->page_len - to;
+ talen = len - pglen;
+ if (talen > tail->iov_len)
+ talen = tail->iov_len;
+ } else
+ pglen = len;
+
+ _copy_from_pages(tail->iov_base + tato, buf->pages,
+ buf->page_base + base + pglen, talen);
+ _shift_data_right_pages(buf->pages, buf->page_base + to,
+ buf->page_base + base, pglen);
+}
+
+static void xdr_buf_head_copy_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *head = buf->head;
+ const struct kvec *tail = buf->tail;
+ unsigned int to = base + shift;
+ unsigned int pglen = 0, pgto = 0;
+ unsigned int talen = 0, tato = 0;
+
+ if (base >= head->iov_len)
+ return;
+ if (len > head->iov_len - base)
+ len = head->iov_len - base;
+ if (to >= buf->page_len + head->iov_len) {
+ tato = to - buf->page_len - head->iov_len;
+ talen = len;
+ } else if (to >= head->iov_len) {
+ pgto = to - head->iov_len;
+ pglen = len;
+ if (pgto + pglen > buf->page_len) {
+ talen = pgto + pglen - buf->page_len;
+ pglen -= talen;
}
- /* Do we also need to copy data from the head into the tail ? */
- if (len > pglen) {
- offs = copy = len - pglen;
- if (copy > tail->iov_len)
- copy = tail->iov_len;
- memcpy(tail->iov_base,
- (char *)head->iov_base +
- head->iov_len - offs,
- copy);
- result += copy;
+ } else {
+ pglen = len - to;
+ if (pglen > buf->page_len) {
+ talen = pglen - buf->page_len;
+ pglen = buf->page_len;
}
}
- /* Now handle pages */
- if (pglen != 0) {
- if (pglen > len)
- _shift_data_right_pages(buf->pages,
- buf->page_base + len,
- buf->page_base,
- pglen - len);
- copy = len;
- if (len > pglen)
- copy = pglen;
- _copy_to_pages(buf->pages, buf->page_base,
- (char *)head->iov_base + head->iov_len - len,
- copy);
- result += copy;
+
+ len -= talen;
+ base += len;
+ if (talen + tato > tail->iov_len)
+ talen = tail->iov_len > tato ? tail->iov_len - tato : 0;
+ memcpy(tail->iov_base + tato, head->iov_base + base, talen);
+
+ len -= pglen;
+ base -= pglen;
+ _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base,
+ pglen);
+
+ base -= len;
+ memmove(head->iov_base + to, head->iov_base + base, len);
+}
+
+static void xdr_buf_tail_shift_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+
+ if (base >= tail->iov_len || !shift || !len)
+ return;
+ xdr_buf_tail_copy_right(buf, base, len, shift);
+}
+
+static void xdr_buf_pages_shift_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ if (!shift || !len)
+ return;
+ if (base >= buf->page_len) {
+ xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift);
+ return;
+ }
+ if (base + len > buf->page_len)
+ xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len,
+ shift);
+ xdr_buf_pages_copy_right(buf, base, len, shift);
+}
+
+static void xdr_buf_head_shift_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *head = buf->head;
+
+ if (!shift)
+ return;
+ if (base >= head->iov_len) {
+ xdr_buf_pages_shift_right(buf, base - head->iov_len, len,
+ shift);
+ return;
+ }
+ if (base + len > head->iov_len)
+ xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len,
+ shift);
+ xdr_buf_head_copy_right(buf, base, len, shift);
+}
+
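+/* The *_copy_left() and *_shift_left() helpers are the mirror image of the
+ * ones above: they move tail or page data towards the start of the buffer,
+ * copying into the preceding pages or head as required.
+ */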
+static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base,
+ unsigned int len, unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+
+ if (base >= tail->iov_len)
+ return;
+ if (len > tail->iov_len - base)
+ len = tail->iov_len - base;
+ /* Shift data into head */
+ if (shift > buf->page_len + base) {
+ const struct kvec *head = buf->head;
+ unsigned int hdto =
+ head->iov_len + buf->page_len + base - shift;
+ unsigned int hdlen = len;
+
+ if (WARN_ONCE(shift > head->iov_len + buf->page_len + base,
+ "SUNRPC: Misaligned data.\n"))
+ return;
+ if (hdto + hdlen > head->iov_len)
+ hdlen = head->iov_len - hdto;
+ memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen);
+ base += hdlen;
+ len -= hdlen;
+ if (!len)
+ return;
+ }
+ /* Shift data into pages */
+ if (shift > base) {
+ unsigned int pgto = buf->page_len + base - shift;
+ unsigned int pglen = len;
+
+ if (pgto + pglen > buf->page_len)
+ pglen = buf->page_len - pgto;
+ _copy_to_pages(buf->pages, buf->page_base + pgto,
+ tail->iov_base + base, pglen);
+ base += pglen;
+ len -= pglen;
+ if (!len)
+ return;
+ }
+ memmove(tail->iov_base + base - shift, tail->iov_base + base, len);
+}
+
+static void xdr_buf_pages_copy_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ unsigned int pgto;
+
+ if (base >= buf->page_len)
+ return;
+ if (len > buf->page_len - base)
+ len = buf->page_len - base;
+ /* Shift data into head */
+ if (shift > base) {
+ const struct kvec *head = buf->head;
+ unsigned int hdto = head->iov_len + base - shift;
+ unsigned int hdlen = len;
+
+ if (WARN_ONCE(shift > head->iov_len + base,
+ "SUNRPC: Misaligned data.\n"))
+ return;
+ if (hdto + hdlen > head->iov_len)
+ hdlen = head->iov_len - hdto;
+ _copy_from_pages(head->iov_base + hdto, buf->pages,
+ buf->page_base + base, hdlen);
+ base += hdlen;
+ len -= hdlen;
+ if (!len)
+ return;
+ }
+ pgto = base - shift;
+ _shift_data_left_pages(buf->pages, buf->page_base + pgto,
+ buf->page_base + base, len);
+}
+
+static void xdr_buf_tail_shift_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ if (!shift || !len)
+ return;
+ xdr_buf_tail_copy_left(buf, base, len, shift);
+}
+
+static void xdr_buf_pages_shift_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ if (!shift || !len)
+ return;
+ if (base >= buf->page_len) {
+ xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift);
+ return;
}
- head->iov_len -= len;
- buf->buflen -= len;
- /* Have we truncated the message? */
- if (buf->len > buf->buflen)
- buf->len = buf->buflen;
+ xdr_buf_pages_copy_left(buf, base, len, shift);
+ len += base;
+ if (len <= buf->page_len)
+ return;
+ xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift);
+}
- return result;
+/**
+ * xdr_shrink_bufhead
+ * @buf: xdr_buf
+ * @len: new length of buf->head[0]
+ *
+ * Shrinks XDR buffer's header kvec buf->head[0], setting it to
+ * 'len' bytes. The extra data is not lost, but is instead
+ * moved into the inlined pages and/or the tail.
+ */
+static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len)
+{
+ struct kvec *head = buf->head;
+ unsigned int shift, buflen = max(buf->len, len);
+
+ WARN_ON_ONCE(len > head->iov_len);
+ if (head->iov_len > buflen) {
+ buf->buflen -= head->iov_len - buflen;
+ head->iov_len = buflen;
+ }
+ if (len >= head->iov_len)
+ return 0;
+ shift = head->iov_len - len;
+ xdr_buf_try_expand(buf, shift);
+ xdr_buf_head_shift_right(buf, len, buflen - len, shift);
+ head->iov_len = len;
+ buf->buflen -= shift;
+ buf->len -= shift;
+ return shift;
}
/**
- * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes
+ * xdr_shrink_pagelen - shrinks buf->pages to @len bytes
* @buf: xdr_buf
- * @len: bytes to remove from buf->pages
+ * @len: new page buffer length
*
* The extra data is not lost, but is instead moved into buf->tail.
* Returns the actual number of bytes moved.
*/
-static unsigned int
-xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
+static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len)
{
- struct kvec *tail;
- size_t copy;
- unsigned int pglen = buf->page_len;
- unsigned int tailbuf_len;
- unsigned int result;
-
- result = 0;
- tail = buf->tail;
- if (len > buf->page_len)
- len = buf-> page_len;
- tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
-
- /* Shift the tail first */
- if (tailbuf_len != 0) {
- unsigned int free_space = tailbuf_len - tail->iov_len;
-
- if (len < free_space)
- free_space = len;
- tail->iov_len += free_space;
-
- copy = len;
- if (tail->iov_len > len) {
- char *p = (char *)tail->iov_base + len;
- memmove(p, tail->iov_base, tail->iov_len - len);
- result += tail->iov_len - len;
- } else
- copy = tail->iov_len;
- /* Copy from the inlined pages into the tail */
- _copy_from_pages((char *)tail->iov_base,
- buf->pages, buf->page_base + pglen - len,
- copy);
- result += copy;
+ unsigned int shift, buflen = buf->len - buf->head->iov_len;
+
+ WARN_ON_ONCE(len > buf->page_len);
+ if (buf->head->iov_len >= buf->len || len > buflen)
+ buflen = len;
+ if (buf->page_len > buflen) {
+ buf->buflen -= buf->page_len - buflen;
+ buf->page_len = buflen;
}
- buf->page_len -= len;
- buf->buflen -= len;
- /* Have we truncated the message? */
- if (buf->len > buf->buflen)
- buf->len = buf->buflen;
-
- return result;
+ if (len >= buf->page_len)
+ return 0;
+ shift = buf->page_len - len;
+ xdr_buf_try_expand(buf, shift);
+ xdr_buf_pages_shift_right(buf, len, buflen - len, shift);
+ buf->page_len = len;
+ buf->len -= shift;
+ buf->buflen -= shift;
+ return shift;
}
void
xdr_shift_buf(struct xdr_buf *buf, size_t len)
{
- xdr_shrink_bufhead(buf, len);
+ xdr_shrink_bufhead(buf, buf->head->iov_len - len);
}
EXPORT_SYMBOL_GPL(xdr_shift_buf);
@@ -505,6 +852,31 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr)
}
EXPORT_SYMBOL_GPL(xdr_stream_pos);
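+/* Reset xdr->nwords so that the stream reports @pos as its current
+ * position relative to the start of the xdr_buf. The _page_ variant
+ * takes @pos relative to the start of the page data.
+ */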
+static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos)
+{
+ unsigned int blen = xdr->buf->len;
+
+ xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0;
+}
+
+static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos)
+{
+ xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len);
+}
+
+/**
+ * xdr_page_pos - Return the current offset from the start of the xdr pages
+ * @xdr: pointer to struct xdr_stream
+ */
+unsigned int xdr_page_pos(const struct xdr_stream *xdr)
+{
+ unsigned int pos = xdr_stream_pos(xdr);
+
+ WARN_ON(pos < xdr->buf->head[0].iov_len);
+ return pos - xdr->buf->head[0].iov_len;
+}
+EXPORT_SYMBOL_GPL(xdr_page_pos);
+
/**
* xdr_init_encode - Initialize a struct xdr_stream for sending data.
* @xdr: pointer to xdr_stream struct
@@ -525,7 +897,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
struct kvec *iov = buf->head;
int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
- xdr_set_scratch_buffer(xdr, NULL, 0);
+ xdr_reset_scratch_buffer(xdr);
BUG_ON(scratch_len < 0);
xdr->buf = buf;
xdr->iov = iov;
@@ -569,7 +941,7 @@ inline void xdr_commit_encode(struct xdr_stream *xdr)
page = page_address(*xdr->page_ptr);
memcpy(xdr->scratch.iov_base, page, shift);
memmove(page, page + shift, (void *)xdr->p - page);
- xdr->scratch.iov_len = 0;
+ xdr_reset_scratch_buffer(xdr);
}
EXPORT_SYMBOL_GPL(xdr_commit_encode);
@@ -599,8 +971,7 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
* the "scratch" iov to track any temporarily unused fragment of
* space at the end of the previous buffer:
*/
- xdr->scratch.iov_base = xdr->p;
- xdr->scratch.iov_len = frag1bytes;
+ xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes);
p = page_address(*xdr->page_ptr);
/*
* Note this is where the next encode will start after we've
@@ -648,6 +1019,51 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
}
EXPORT_SYMBOL_GPL(xdr_reserve_space);
+
+/**
+ * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending
+ * @xdr: pointer to xdr_stream
+ * @vec: pointer to a kvec array
+ * @nbytes: number of bytes to reserve
+ *
+ * Reserves enough buffer space to encode 'nbytes' of data and stores the
+ * pointers in 'vec'. The size argument passed to xdr_reserve_space() is
+ * determined based on the number of bytes remaining in the current page to
+ * avoid invalidating iov_base pointers when xdr_commit_encode() is called.
+ */
+int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes)
+{
+ int thislen;
+ int v = 0;
+ __be32 *p;
+
+ /*
+ * svcrdma requires every READ payload to start somewhere
+ * in xdr->pages.
+ */
+ if (xdr->iov == xdr->buf->head) {
+ xdr->iov = NULL;
+ xdr->end = xdr->p;
+ }
+
+ while (nbytes) {
+ thislen = xdr->buf->page_len % PAGE_SIZE;
+ thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen);
+
+ p = xdr_reserve_space(xdr, thislen);
+ if (!p)
+ return -EIO;
+
+ vec[v].iov_base = p;
+ vec[v].iov_len = thislen;
+ v++;
+ nbytes -= thislen;
+ }
+
+ return v;
+}
+EXPORT_SYMBOL_GPL(xdr_reserve_space_vec);
+
/**
* xdr_truncate_encode - truncate an encode buffer
* @xdr: pointer to xdr_stream
@@ -658,7 +1074,7 @@ EXPORT_SYMBOL_GPL(xdr_reserve_space);
* head, tail, and page lengths are adjusted to correspond.
*
* If this means moving xdr->p to a different buffer, we assume that
- * that the end pointer should be set to the end of the current page,
+ * the end pointer should be set to the end of the current page,
* except in the case of the head buffer when we assume the head
* buffer's current length represents the end of the available buffer.
*
@@ -781,19 +1197,31 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b
}
EXPORT_SYMBOL_GPL(xdr_write_pages);
-static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
- unsigned int len)
+static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
+ unsigned int base, unsigned int len)
{
if (len > iov->iov_len)
len = iov->iov_len;
- xdr->p = (__be32*)iov->iov_base;
+ if (unlikely(base > len))
+ base = len;
+ xdr->p = (__be32*)(iov->iov_base + base);
xdr->end = (__be32*)(iov->iov_base + len);
xdr->iov = iov;
xdr->page_ptr = NULL;
+ return len - base;
+}
+
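+/* Position the stream at offset @base within the tail kvec and update
+ * the stream position to match. Returns the number of bytes that remain
+ * readable from that point.
+ */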
+static unsigned int xdr_set_tail_base(struct xdr_stream *xdr,
+ unsigned int base, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+
+ xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len);
+ return xdr_set_iov(xdr, buf->tail, base, len);
}
-static int xdr_set_page_base(struct xdr_stream *xdr,
- unsigned int base, unsigned int len)
+static unsigned int xdr_set_page_base(struct xdr_stream *xdr,
+ unsigned int base, unsigned int len)
{
unsigned int pgnr;
unsigned int maxlen;
@@ -802,12 +1230,15 @@ static int xdr_set_page_base(struct xdr_stream *xdr,
void *kaddr;
maxlen = xdr->buf->page_len;
- if (base >= maxlen)
- return -EINVAL;
- maxlen -= base;
+ if (base >= maxlen) {
+ base = maxlen;
+ maxlen = 0;
+ } else
+ maxlen -= base;
if (len > maxlen)
len = maxlen;
+ xdr_stream_page_set_pos(xdr, base);
base += xdr->buf->page_base;
pgnr = base >> PAGE_SHIFT;
@@ -822,7 +1253,16 @@ static int xdr_set_page_base(struct xdr_stream *xdr,
pgend = PAGE_SIZE;
xdr->end = (__be32*)(kaddr + pgend);
xdr->iov = NULL;
- return 0;
+ return len;
+}
+
+static void xdr_set_page(struct xdr_stream *xdr, unsigned int base,
+ unsigned int len)
+{
+ if (xdr_set_page_base(xdr, base, len) == 0) {
+ base -= xdr->buf->page_len;
+ xdr_set_tail_base(xdr, base, len);
+ }
}
static void xdr_set_next_page(struct xdr_stream *xdr)
@@ -831,19 +1271,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT;
newbase -= xdr->buf->page_base;
-
- if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
+ if (newbase < xdr->buf->page_len)
+ xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr));
+ else
+ xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr));
}
static bool xdr_set_next_buffer(struct xdr_stream *xdr)
{
if (xdr->page_ptr != NULL)
xdr_set_next_page(xdr);
- else if (xdr->iov == xdr->buf->head) {
- if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
- }
+ else if (xdr->iov == xdr->buf->head)
+ xdr_set_page(xdr, 0, xdr_stream_remaining(xdr));
return xdr->p != xdr->end;
}
@@ -858,15 +1297,11 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
struct rpc_rqst *rqst)
{
xdr->buf = buf;
- xdr->scratch.iov_base = NULL;
- xdr->scratch.iov_len = 0;
+ xdr_reset_scratch_buffer(xdr);
xdr->nwords = XDR_QUADLEN(buf->len);
- if (buf->head[0].iov_len != 0)
- xdr_set_iov(xdr, buf->head, buf->len);
- else if (buf->page_len != 0)
- xdr_set_page_base(xdr, 0, buf->len);
- else
- xdr_set_iov(xdr, buf->head, buf->len);
+ if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 &&
+ xdr_set_page_base(xdr, 0, buf->len) == 0)
+ xdr_set_iov(xdr, buf->tail, 0, buf->len);
if (p != NULL && p > xdr->p && xdr->end >= p) {
xdr->nwords -= p - xdr->p;
xdr->p = p;
@@ -907,24 +1342,6 @@ static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
return p;
}
-/**
- * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
- * @xdr: pointer to xdr_stream struct
- * @buf: pointer to an empty buffer
- * @buflen: size of 'buf'
- *
- * The scratch buffer is used when decoding from an array of pages.
- * If an xdr_inline_decode() call spans across page boundaries, then
- * we copy the data into the scratch buffer in order to allow linear
- * access.
- */
-void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
-{
- xdr->scratch.iov_base = buf;
- xdr->scratch.iov_len = buflen;
-}
-EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
-
static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
{
__be32 *p;
@@ -979,26 +1396,31 @@ out_overflow:
}
EXPORT_SYMBOL_GPL(xdr_inline_decode);
-static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
+static void xdr_realign_pages(struct xdr_stream *xdr)
{
struct xdr_buf *buf = xdr->buf;
- struct kvec *iov;
- unsigned int nwords = XDR_QUADLEN(len);
+ struct kvec *iov = buf->head;
unsigned int cur = xdr_stream_pos(xdr);
- unsigned int copied, offset;
-
- if (xdr->nwords == 0)
- return 0;
+ unsigned int copied;
/* Realign pages to current pointer position */
- iov = buf->head;
if (iov->iov_len > cur) {
- offset = iov->iov_len - cur;
- copied = xdr_shrink_bufhead(buf, offset);
- trace_rpc_xdr_alignment(xdr, offset, copied);
- xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ copied = xdr_shrink_bufhead(buf, cur);
+ trace_rpc_xdr_alignment(xdr, cur, copied);
+ xdr_set_page(xdr, 0, buf->page_len);
}
+}
+
+static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ unsigned int nwords = XDR_QUADLEN(len);
+ unsigned int copied;
+
+ if (xdr->nwords == 0)
+ return 0;
+ xdr_realign_pages(xdr);
if (nwords > xdr->nwords) {
nwords = xdr->nwords;
len = nwords << 2;
@@ -1007,56 +1429,107 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
len = buf->page_len;
else if (nwords < xdr->nwords) {
/* Truncate page data and move it into the tail */
- offset = buf->page_len - len;
- copied = xdr_shrink_pagelen(buf, offset);
- trace_rpc_xdr_alignment(xdr, offset, copied);
- xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ copied = xdr_shrink_pagelen(buf, len);
+ trace_rpc_xdr_alignment(xdr, len, copied);
}
return len;
}
/**
- * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position
+ * xdr_read_pages - align page-based XDR data to current pointer position
* @xdr: pointer to xdr_stream struct
* @len: number of bytes of page data
*
* Moves data beyond the current pointer position from the XDR head[] buffer
- * into the page list. Any data that lies beyond current position + "len"
- * bytes is moved into the XDR tail[].
+ * into the page list. Any data that lies beyond current position + @len
+ * bytes is moved into the XDR tail[]. The xdr_stream current position is
+ * then advanced past that data to align to the next XDR object in the tail.
*
* Returns the number of XDR encoded bytes now contained in the pages
*/
unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
{
- struct xdr_buf *buf = xdr->buf;
- struct kvec *iov;
- unsigned int nwords;
- unsigned int end;
- unsigned int padding;
+ unsigned int nwords = XDR_QUADLEN(len);
+ unsigned int base, end, pglen;
- len = xdr_align_pages(xdr, len);
- if (len == 0)
+ pglen = xdr_align_pages(xdr, nwords << 2);
+ if (pglen == 0)
return 0;
- nwords = XDR_QUADLEN(len);
- padding = (nwords << 2) - len;
- xdr->iov = iov = buf->tail;
- /* Compute remaining message length. */
- end = ((xdr->nwords - nwords) << 2) + padding;
- if (end > iov->iov_len)
- end = iov->iov_len;
- /*
- * Position current pointer at beginning of tail, and
- * set remaining message length.
- */
- xdr->p = (__be32 *)((char *)iov->iov_base + padding);
- xdr->end = (__be32 *)((char *)iov->iov_base + end);
- xdr->page_ptr = NULL;
- xdr->nwords = XDR_QUADLEN(end - padding);
- return len;
+ base = (nwords << 2) - pglen;
+ end = xdr_stream_remaining(xdr) - pglen;
+
+ xdr_set_tail_base(xdr, base, end);
+ return len <= pglen ? len : pglen;
}
EXPORT_SYMBOL_GPL(xdr_read_pages);
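+/**
+ * xdr_align_data - Shift received page data left to a given offset
+ * @xdr: pointer to xdr_stream struct
+ * @offset: target offset from the start of the page data
+ * @length: number of data bytes expected at @offset
+ *
+ * Moves the page data at the stream's current position so that it starts
+ * at @offset, then advances the stream past that data.
+ *
+ * Returns @length, truncated to the number of bytes actually remaining.
+ */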
+unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
+ unsigned int length)
+{
+ struct xdr_buf *buf = xdr->buf;
+ unsigned int from, bytes, len;
+ unsigned int shift;
+
+ xdr_realign_pages(xdr);
+ from = xdr_page_pos(xdr);
+
+ if (from >= buf->page_len + buf->tail->iov_len)
+ return 0;
+ if (from + buf->head->iov_len >= buf->len)
+ return 0;
+
+ len = buf->len - buf->head->iov_len;
+
+ /* We only shift data left! */
+ if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n",
+ from, offset))
+ return 0;
+ if (WARN_ONCE(offset > buf->page_len,
+ "SUNRPC: buffer overflow. offset=%u, page_len=%u\n",
+ offset, buf->page_len))
+ return 0;
+
+ /* Move page data to the left */
+ shift = from - offset;
+ xdr_buf_pages_shift_left(buf, from, len, shift);
+
+ bytes = xdr_stream_remaining(xdr);
+ if (length > bytes)
+ length = bytes;
+ bytes -= length;
+
+ xdr->buf->len -= shift;
+ xdr_set_page(xdr, offset + length, bytes);
+ return length;
+}
+EXPORT_SYMBOL_GPL(xdr_align_data);
+
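+/**
+ * xdr_expand_hole - Expand and zero-fill a hole in the page data
+ * @xdr: pointer to xdr_stream struct
+ * @offset: offset of the hole from the start of the page data
+ * @length: number of bytes the hole occupies
+ *
+ * Shifts any data following the hole so that @length zeroed bytes can
+ * sit at @offset, then positions the stream at the XDR-aligned end of
+ * the hole.
+ *
+ * Returns @length.
+ */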
+unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset,
+ unsigned int length)
+{
+ struct xdr_buf *buf = xdr->buf;
+ unsigned int from, to, shift;
+
+ xdr_realign_pages(xdr);
+ from = xdr_page_pos(xdr);
+ to = xdr_align_size(offset + length);
+
+ /* Could the hole be behind us? */
+ if (to > from) {
+ unsigned int buflen = buf->len - buf->head->iov_len;
+ shift = to - from;
+ xdr_buf_try_expand(buf, shift);
+ xdr_buf_pages_shift_right(buf, from, buflen, shift);
+ xdr_set_page(xdr, to, xdr_stream_remaining(xdr));
+ } else if (to != from)
+ xdr_align_data(xdr, to, 0);
+ xdr_buf_pages_zero(buf, offset, length);
+
+ return length;
+}
+EXPORT_SYMBOL_GPL(xdr_expand_hole);
+
/**
* xdr_enter_page - decode data from the XDR page
* @xdr: pointer to xdr_stream struct
@@ -1081,8 +1554,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page);
static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
-void
-xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
+void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf)
{
buf->head[0] = *iov;
buf->tail[0] = empty_iov;
@@ -1105,9 +1577,8 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
*
* Returns -1 if base or length is out of bounds.
*/
-int
-xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
- unsigned int base, unsigned int len)
+int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf,
+ unsigned int base, unsigned int len)
{
subbuf->buflen = subbuf->len = len;
if (base < buf->head[0].iov_len) {
@@ -1155,6 +1626,51 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
EXPORT_SYMBOL_GPL(xdr_buf_subsegment);
/**
+ * xdr_stream_subsegment - set @subbuf to a portion of @xdr
+ * @xdr: an xdr_stream set up for decoding
+ * @subbuf: the result buffer
+ * @nbytes: length of @xdr to extract, in bytes
+ *
+ * Sets up @subbuf to represent a portion of @xdr. The portion
+ * starts at the current offset in @xdr, and extends for a length
+ * of @nbytes. If this is successful, @xdr is advanced to the next
+ * position following that portion.
+ *
+ * Return values:
+ * %true: @subbuf has been initialized, and @xdr has been advanced.
+ * %false: a bounds error has occurred
+ */
+bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
+ unsigned int nbytes)
+{
+ unsigned int remaining, offset, len;
+
+ if (xdr_buf_subsegment(xdr->buf, subbuf, xdr_stream_pos(xdr), nbytes))
+ return false;
+
+ if (subbuf->head[0].iov_len)
+ if (!__xdr_inline_decode(xdr, subbuf->head[0].iov_len))
+ return false;
+
+ remaining = subbuf->page_len;
+ offset = subbuf->page_base;
+ while (remaining) {
+ len = min_t(unsigned int, remaining, PAGE_SIZE) - offset;
+
+ if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
+ return false;
+ if (!__xdr_inline_decode(xdr, len))
+ return false;
+
+ remaining -= len;
+ offset = 0;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_subsegment);
+
+/**
* xdr_buf_trim - lop at most "len" bytes off the end of "buf"
* @buf: buf to be trimmed
* @len: number of bytes to reduce "buf" by
@@ -1195,7 +1711,8 @@ fix_len:
}
EXPORT_SYMBOL_GPL(xdr_buf_trim);
-static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf,
+ void *obj, unsigned int len)
{
unsigned int this_len;
@@ -1204,8 +1721,7 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->page_len);
- if (this_len)
- _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len);
+ _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len);
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);
@@ -1213,7 +1729,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne
}
/* obj is assumed to point to allocated memory of size at least len: */
-int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base,
+ void *obj, unsigned int len)
{
struct xdr_buf subbuf;
int status;
@@ -1226,7 +1743,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u
}
EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf);
-static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf,
+ void *obj, unsigned int len)
{
unsigned int this_len;
@@ -1235,8 +1753,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->page_len);
- if (this_len)
- _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len);
+ _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len);
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);
@@ -1244,7 +1761,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned
}
/* obj is assumed to point to allocated memory of size at least len: */
-int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base,
+ void *obj, unsigned int len)
{
struct xdr_buf subbuf;
int status;
@@ -1257,8 +1775,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un
}
EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf);
-int
-xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
+int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj)
{
__be32 raw;
int status;
@@ -1271,8 +1788,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
}
EXPORT_SYMBOL_GPL(xdr_decode_word);
-int
-xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
+int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj)
{
__be32 raw = cpu_to_be32(obj);
@@ -1281,9 +1797,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
EXPORT_SYMBOL_GPL(xdr_encode_word);
/* Returns 0 on success, or else a negative error code. */
-static int
-xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
- struct xdr_array2_desc *desc, int encode)
+static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc, int encode)
{
char *elem = NULL, *c;
unsigned int copied = 0, todo, avail_here;
@@ -1475,9 +1990,8 @@ out:
return err;
}
-int
-xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
- struct xdr_array2_desc *desc)
+int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
{
if (base >= buf->len)
return -EINVAL;
@@ -1486,9 +2000,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
}
EXPORT_SYMBOL_GPL(xdr_decode_array2);
-int
-xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
- struct xdr_array2_desc *desc)
+int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
{
if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
buf->head->iov_len + buf->page_len + buf->tail->iov_len)
@@ -1498,9 +2011,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
}
EXPORT_SYMBOL_GPL(xdr_encode_array2);
-int
-xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
- int (*actor)(struct scatterlist *, void *), void *data)
+int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset,
+ unsigned int len,
+ int (*actor)(struct scatterlist *, void *), void *data)
{
int i, ret = 0;
unsigned int page_len, thislen, page_offset;
@@ -1668,10 +2181,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
if (ret > 0) {
- char *s = kmalloc(ret + 1, gfp_flags);
+ char *s = kmemdup_nul(p, ret, gfp_flags);
if (s != NULL) {
- memcpy(s, p, ret);
- s[ret] = '\0';
*str = s;
return strlen(s);
}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 5a8e47bbfb9f..691ccf8049a4 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -151,33 +151,94 @@ out:
}
EXPORT_SYMBOL_GPL(xprt_unregister_transport);
-/**
- * xprt_load_transport - load a transport implementation
- * @transport_name: transport to load
- *
- * Returns:
- * 0: transport successfully loaded
- * -ENOENT: transport module not available
- */
-int xprt_load_transport(const char *transport_name)
+static void
+xprt_class_release(const struct xprt_class *t)
{
- struct xprt_class *t;
- int result;
+ module_put(t->owner);
+}
+
+static const struct xprt_class *
+xprt_class_find_by_ident_locked(int ident)
+{
+ const struct xprt_class *t;
+
+ list_for_each_entry(t, &xprt_list, list) {
+ if (t->ident != ident)
+ continue;
+ if (!try_module_get(t->owner))
+ continue;
+ return t;
+ }
+ return NULL;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_ident(int ident)
+{
+ const struct xprt_class *t;
- result = 0;
spin_lock(&xprt_list_lock);
+ t = xprt_class_find_by_ident_locked(ident);
+ spin_unlock(&xprt_list_lock);
+ return t;
+}
+
+static const struct xprt_class *
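+/* Look up a registered transport class that advertises @netid in its
+ * netid[] table (terminated by an empty string), taking a reference on
+ * the owning module. Caller must hold xprt_list_lock.
+ */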
+xprt_class_find_by_netid_locked(const char *netid)
+{
+ const struct xprt_class *t;
+ unsigned int i;
+
list_for_each_entry(t, &xprt_list, list) {
- if (strcmp(t->name, transport_name) == 0) {
- spin_unlock(&xprt_list_lock);
- goto out;
+ for (i = 0; t->netid[i][0] != '\0'; i++) {
+ if (strcmp(t->netid[i], netid) != 0)
+ continue;
+ if (!try_module_get(t->owner))
+ continue;
+ return t;
}
}
+ return NULL;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_netid(const char *netid)
+{
+ const struct xprt_class *t;
+
+ spin_lock(&xprt_list_lock);
+ t = xprt_class_find_by_netid_locked(netid);
+ if (!t) {
+ spin_unlock(&xprt_list_lock);
+ request_module("rpc%s", netid);
+ spin_lock(&xprt_list_lock);
+ t = xprt_class_find_by_netid_locked(netid);
+ }
spin_unlock(&xprt_list_lock);
- result = request_module("xprt%s", transport_name);
-out:
- return result;
+ return t;
}
-EXPORT_SYMBOL_GPL(xprt_load_transport);
+
+/**
+ * xprt_find_transport_ident - convert a netid into a transport identifier
+ * @netid: netid of the transport to look up
+ *
+ * Returns:
+ * > 0: transport identifier
+ * -ENOENT: transport module not available
+ */
+int xprt_find_transport_ident(const char *netid)
+{
+ const struct xprt_class *t;
+ int ret;
+
+ t = xprt_class_find_by_netid(netid);
+ if (!t)
+ return -ENOENT;
+ ret = t->ident;
+ xprt_class_release(t);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xprt_find_transport_ident);
static void xprt_clear_locked(struct rpc_xprt *xprt)
{
@@ -834,8 +895,7 @@ void xprt_connect(struct rpc_task *task)
{
struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
- dprintk("RPC: %5u xprt_connect xprt %p %s connected\n", task->tk_pid,
- xprt, (xprt_connected(xprt) ? "is" : "is not"));
+ trace_xprt_connect(xprt);
if (!xprt_bound(xprt)) {
task->tk_status = -EAGAIN;
@@ -1131,8 +1191,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- trace_xprt_complete_rqst(xprt, req->rq_xid, copied);
-
xprt->stat.recvs++;
req->rq_private_buf.len = copied;
@@ -1269,7 +1327,6 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
/* Note: req is added _before_ pos */
list_add_tail(&req->rq_xmit, &pos->rq_xmit);
INIT_LIST_HEAD(&req->rq_xmit2);
- trace_xprt_enq_xmit(task, 1);
goto out;
}
} else if (RPC_IS_SWAPPER(task)) {
@@ -1281,7 +1338,6 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
/* Note: req is added _before_ pos */
list_add_tail(&req->rq_xmit, &pos->rq_xmit);
INIT_LIST_HEAD(&req->rq_xmit2);
- trace_xprt_enq_xmit(task, 2);
goto out;
}
} else if (!req->rq_seqno) {
@@ -1290,13 +1346,11 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
continue;
list_add_tail(&req->rq_xmit2, &pos->rq_xmit2);
INIT_LIST_HEAD(&req->rq_xmit);
- trace_xprt_enq_xmit(task, 3);
goto out;
}
}
list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
INIT_LIST_HEAD(&req->rq_xmit2);
- trace_xprt_enq_xmit(task, 4);
out:
set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
spin_unlock(&xprt->queue_lock);
@@ -1414,9 +1468,9 @@ bool xprt_prepare_transmit(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
-
if (!xprt_lock_write(xprt, task)) {
+ trace_xprt_transmit_queued(xprt, task);
+
/* Race breaker: someone may have transmitted us */
if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
rpc_wake_up_queued_task_set_status(&xprt->sending,
@@ -1520,10 +1574,13 @@ xprt_transmit(struct rpc_task *task)
{
struct rpc_rqst *next, *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- int status;
+ int counter, status;
spin_lock(&xprt->queue_lock);
+ counter = 0;
while (!list_empty(&xprt->xmit_queue)) {
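+ /* Bound the number of requests sent in one pass so that a single
+ * caller cannot monopolize the transport's send path.
+ */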
+ if (++counter == 20)
+ break;
next = list_first_entry(&xprt->xmit_queue,
struct rpc_rqst, rq_xmit);
xprt_pin_rqst(next);
@@ -1531,7 +1588,6 @@ xprt_transmit(struct rpc_task *task)
status = xprt_request_transmit(next, task);
if (status == -EBADMSG && next != req)
status = 0;
- cond_resched();
spin_lock(&xprt->queue_lock);
xprt_unpin_rqst(next);
if (status == 0) {
@@ -1747,8 +1803,8 @@ xprt_request_init(struct rpc_task *task)
req->rq_rcv_buf.bvec = NULL;
req->rq_release_snd_buf = NULL;
xprt_init_majortimeo(task, req);
- dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
- req, ntohl(req->rq_xid));
+
+ trace_xprt_reserve(req);
}
static void
@@ -1838,7 +1894,6 @@ void xprt_release(struct rpc_task *task)
if (req->rq_release_snd_buf)
req->rq_release_snd_buf(req);
- dprintk("RPC: %5u release request %p\n", task->tk_pid, req);
if (likely(!bc_prealloc(req)))
xprt->ops->free_slot(xprt, req);
else
@@ -1902,21 +1957,17 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
{
struct rpc_xprt *xprt;
- struct xprt_class *t;
+ const struct xprt_class *t;
- spin_lock(&xprt_list_lock);
- list_for_each_entry(t, &xprt_list, list) {
- if (t->ident == args->ident) {
- spin_unlock(&xprt_list_lock);
- goto found;
- }
+ t = xprt_class_find_by_ident(args->ident);
+ if (!t) {
+ dprintk("RPC: transport (%d) not supported\n", args->ident);
+ return ERR_PTR(-EIO);
}
- spin_unlock(&xprt_list_lock);
- dprintk("RPC: transport (%d) not supported\n", args->ident);
- return ERR_PTR(-EIO);
-found:
xprt = t->setup(args);
+ xprt_class_release(t);
+
if (IS_ERR(xprt))
goto out;
if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT)
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 8ed0377d7a18..55b21bae866d 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -4,5 +4,5 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
- module.o
+ svc_rdma_pcl.o module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index c92c1aac270a..946edf2db646 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (c) 2015 Oracle. All rights reserved.
+ * Copyright (c) 2015-2020, Oracle and/or its affiliates.
*
* Support for backward direction RPCs on RPC/RDMA.
*/
@@ -82,7 +82,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
&rqst->rq_snd_buf, rpcrdma_noch_pullup))
return -EIO;
- trace_xprtrdma_cb_reply(rqst);
+ trace_xprtrdma_cb_reply(r_xprt, rqst);
return 0;
}
@@ -260,7 +260,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
*/
req = rpcr_to_rdmar(rqst);
req->rl_reply = rep;
- trace_xprtrdma_cb_call(rqst);
+ trace_xprtrdma_cb_call(r_xprt, rqst);
/* Queue rqst for ULP's callback service */
bc_serv = xprt->bc_serv;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 7f94c9a19fd3..baca49fe83af 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -65,18 +65,23 @@ void frwr_release_mr(struct rpcrdma_mr *mr)
kfree(mr);
}
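+/* DMA-unmap the MR's scatterlist if it is currently mapped; mr_device
+ * doubles as the "is mapped" flag and is cleared here.
+ */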
+static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+{
+ if (mr->mr_device) {
+ trace_xprtrdma_mr_unmap(mr);
+ ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents,
+ mr->mr_dir);
+ mr->mr_device = NULL;
+ }
+}
+
static void frwr_mr_recycle(struct rpcrdma_mr *mr)
{
struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
trace_xprtrdma_mr_recycle(mr);
- if (mr->mr_dir != DMA_NONE) {
- trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
- mr->mr_sg, mr->mr_nents, mr->mr_dir);
- mr->mr_dir = DMA_NONE;
- }
+ frwr_mr_unmap(r_xprt, mr);
spin_lock(&r_xprt->rx_buf.rb_lock);
list_del(&mr->mr_all);
@@ -86,6 +91,16 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr)
frwr_release_mr(mr);
}
+static void frwr_mr_put(struct rpcrdma_mr *mr)
+{
+ frwr_mr_unmap(mr->mr_xprt, mr);
+
+ /* The MR is returned to the req's MR free list instead
+ * of to the xprt's MR free list. No spinlock is needed.
+ */
+ rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
+}
+
/* frwr_reset - Place MRs back on the free list
* @req: request to reset
*
@@ -101,7 +116,7 @@ void frwr_reset(struct rpcrdma_req *req)
struct rpcrdma_mr *mr;
while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
- rpcrdma_mr_put(mr);
+ frwr_mr_put(mr);
}
/**
@@ -124,13 +139,13 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
if (IS_ERR(frmr))
goto out_mr_err;
- sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
+ sg = kmalloc_array(depth, sizeof(*sg), GFP_NOFS);
if (!sg)
goto out_list_err;
mr->mr_xprt = r_xprt;
mr->frwr.fr_mr = frmr;
- mr->mr_dir = DMA_NONE;
+ mr->mr_device = NULL;
INIT_LIST_HEAD(&mr->mr_list);
init_completion(&mr->frwr.fr_linv_done);
@@ -315,6 +330,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
mr->mr_dir);
if (!dma_nents)
goto out_dmamap_err;
+ mr->mr_device = ep->re_id->device;
ibmr = mr->frwr.fr_mr;
n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
@@ -341,7 +357,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
return seg;
out_dmamap_err:
- mr->mr_dir = DMA_NONE;
trace_xprtrdma_frwr_sgerr(mr, i);
return ERR_PTR(-EIO);
@@ -363,12 +378,21 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
container_of(cqe, struct rpcrdma_frwr, fr_cqe);
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_fastreg(wc, frwr);
+ trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid);
/* The MR will get recycled when the associated req is retransmitted */
rpcrdma_flush_disconnect(cq->cq_context, wc);
}
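+/* Record the send CQ and MR resource IDs in the completion ID so that
+ * completion tracepoints can identify which queue and MR a work
+ * completion belongs to.
+ */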
+static void frwr_cid_init(struct rpcrdma_ep *ep,
+ struct rpcrdma_frwr *frwr)
+{
+ struct rpc_rdma_cid *cid = &frwr->fr_cid;
+
+ cid->ci_queue_id = ep->re_attr.send_cq->res.id;
+ cid->ci_completion_id = frwr->fr_mr->res.id;
+}
+
/**
* frwr_send - post Send WRs containing the RPC Call message
* @r_xprt: controlling transport instance
@@ -385,6 +409,7 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
*/
int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
@@ -395,6 +420,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
frwr = &mr->frwr;
frwr->fr_cqe.done = frwr_wc_fastreg;
+ frwr_cid_init(ep, frwr);
frwr->fr_regwr.wr.next = post_wr;
frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
frwr->fr_regwr.wr.num_sge = 0;
@@ -404,7 +430,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
post_wr = &frwr->fr_regwr.wr;
}
- return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
+ return ib_post_send(ep->re_id->qp, post_wr, NULL);
}
/**
@@ -420,18 +446,17 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
list_for_each_entry(mr, mrs, mr_list)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
- trace_xprtrdma_mr_reminv(mr);
- rpcrdma_mr_put(mr);
+ frwr_mr_put(mr);
break; /* only one invalidated MR per RPC */
}
}
-static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
+static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
if (wc->status != IB_WC_SUCCESS)
frwr_mr_recycle(mr);
else
- rpcrdma_mr_put(mr);
+ frwr_mr_put(mr);
}
/**
@@ -448,8 +473,8 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_li(wc, frwr);
- __frwr_release_mr(wc, mr);
+ trace_xprtrdma_wc_li(wc, &frwr->fr_cid);
+ frwr_mr_done(wc, mr);
rpcrdma_flush_disconnect(cq->cq_context, wc);
}
@@ -469,8 +494,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_li_wake(wc, frwr);
- __frwr_release_mr(wc, mr);
+ trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid);
+ frwr_mr_done(wc, mr);
complete(&frwr->fr_linv_done);
rpcrdma_flush_disconnect(cq->cq_context, wc);
@@ -490,6 +515,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *first, **prev, *last;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
const struct ib_send_wr *bad_wr;
struct rpcrdma_frwr *frwr;
struct rpcrdma_mr *mr;
@@ -509,6 +535,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
frwr = &mr->frwr;
frwr->fr_cqe.done = frwr_wc_localinv;
+ frwr_cid_init(ep, frwr);
last = &frwr->fr_invwr;
last->next = NULL;
last->wr_cqe = &frwr->fr_cqe;
@@ -534,7 +561,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
+ rc = ib_post_send(ep->re_id->qp, first, &bad_wr);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
@@ -547,7 +574,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
- trace_xprtrdma_post_linv(req, rc);
+ trace_xprtrdma_post_linv_err(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr,
fr_invwr);
@@ -574,10 +601,10 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_li_done(wc, frwr);
- __frwr_release_mr(wc, mr);
+ trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid);
+ frwr_mr_done(wc, mr);
- /* Ensure @rep is generated before __frwr_release_mr */
+ /* Ensure @rep is generated before frwr_mr_done */
smp_rmb();
rpcrdma_complete_rqst(rep);
@@ -597,6 +624,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *first, *last, **prev;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
const struct ib_send_wr *bad_wr;
struct rpcrdma_frwr *frwr;
struct rpcrdma_mr *mr;
@@ -614,6 +642,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
frwr = &mr->frwr;
frwr->fr_cqe.done = frwr_wc_localinv;
+ frwr_cid_init(ep, frwr);
last = &frwr->fr_invwr;
last->next = NULL;
last->wr_cqe = &frwr->fr_cqe;
@@ -639,13 +668,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
+ rc = ib_post_send(ep->re_id->qp, first, &bad_wr);
if (!rc)
return;
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
- trace_xprtrdma_post_linv(req, rc);
+ trace_xprtrdma_post_linv_err(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
mr = container_of(frwr, struct rpcrdma_mr, frwr);
diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c
index 620327c01302..45c5b41ac8dc 100644
--- a/net/sunrpc/xprtrdma/module.c
+++ b/net/sunrpc/xprtrdma/module.c
@@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("svcrdma");
MODULE_ALIAS("xprtrdma");
+MODULE_ALIAS("rpcrdma6");
static void __exit rpc_rdma_cleanup(void)
{
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 0f5120c7668f..8f5d0cb68360 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
- * Copyright (c) 2014-2017 Oracle. All rights reserved.
+ * Copyright (c) 2014-2020, Oracle and/or its affiliates.
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -179,6 +179,31 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
r_xprt->rx_ep->re_max_inline_recv;
}
+/* ACL likes to be lazy in allocating pages. For TCP, these
+ * pages can be allocated during receive processing. Not true
+ * for RDMA, which must always provision receive buffers
+ * up front.
+ */
+static noinline int
+rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
+{
+ struct page **ppages;
+ int len;
+
+ len = buf->page_len;
+ ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
+ while (len > 0) {
+ if (!*ppages)
+ *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (!*ppages)
+ return -ENOBUFS;
+ ppages++;
+ len -= PAGE_SIZE;
+ }
+
+ return 0;
+}
+
/* Split @vec on page boundaries into SGEs. FMR registers pages, not
* a byte range. Other modes coalesce these SGEs into a single MR
* when they can.
@@ -233,15 +258,6 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = offset_in_page(xdrbuf->page_base);
while (len) {
- /* ACL likes to be lazy in allocating pages - ACLs
- * are small by default but can get huge.
- */
- if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) {
- if (!*ppages)
- *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
- if (!*ppages)
- return -ENOBUFS;
- }
seg->mr_page = *ppages;
seg->mr_offset = (char *)page_base;
seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
@@ -315,7 +331,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
*mr = rpcrdma_mr_get(r_xprt);
if (!*mr)
goto out_getmr_err;
- trace_xprtrdma_mr_get(req);
(*mr)->mr_req = req;
}
@@ -323,7 +338,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
out_getmr_err:
- trace_xprtrdma_nomrs(req);
+ trace_xprtrdma_nomrs_err(r_xprt, req);
xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
rpcrdma_mrs_refresh(r_xprt);
return ERR_PTR(-EAGAIN);
@@ -867,6 +882,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
__be32 *p;
int ret;
+ if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) {
+ ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf);
+ if (ret)
+ return ret;
+ }
+
rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
rqst);
@@ -1322,20 +1343,13 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
p = xdr_inline_decode(xdr, 2 * sizeof(*p));
if (!p)
break;
- dprintk("RPC: %s: server reports "
- "version error (%u-%u), xid %08x\n", __func__,
- be32_to_cpup(p), be32_to_cpu(*(p + 1)),
- be32_to_cpu(rep->rr_xid));
+ trace_xprtrdma_err_vers(rqst, p, p + 1);
break;
case err_chunk:
- dprintk("RPC: %s: server reports "
- "header decoding error, xid %08x\n", __func__,
- be32_to_cpu(rep->rr_xid));
+ trace_xprtrdma_err_chunk(rqst);
break;
default:
- dprintk("RPC: %s: server reports "
- "unrecognized error %d, xid %08x\n", __func__,
- be32_to_cpup(p), be32_to_cpu(rep->rr_xid));
+ trace_xprtrdma_err_unrecognized(rqst, p);
}
return -EIO;
@@ -1376,7 +1390,7 @@ out:
return;
out_badheader:
- trace_xprtrdma_reply_hdr(rep);
+ trace_xprtrdma_reply_hdr_err(rep);
r_xprt->rx_stats.bad_reply_count++;
rqst->rq_task->tk_status = status;
status = 0;
@@ -1450,14 +1464,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
rpcrdma_post_recvs(r_xprt, false);
req = rpcr_to_rdmar(rqst);
- if (req->rl_reply) {
- trace_xprtrdma_leaked_rep(rqst, req->rl_reply);
+ if (unlikely(req->rl_reply))
rpcrdma_recv_buffer_put(req->rl_reply);
- }
req->rl_reply = rep;
rep->rr_rqst = rqst;
- trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
+ trace_xprtrdma_reply(rqst->rq_task, rep, credits);
if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
frwr_reminv(rep, &req->rl_registered);
@@ -1469,16 +1481,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
return;
out_badversion:
- trace_xprtrdma_reply_vers(rep);
+ trace_xprtrdma_reply_vers_err(rep);
goto out;
out_norqst:
spin_unlock(&xprt->queue_lock);
- trace_xprtrdma_reply_rqst(rep);
+ trace_xprtrdma_reply_rqst_err(rep);
goto out;
out_shortreply:
- trace_xprtrdma_reply_short(rep);
+ trace_xprtrdma_reply_short_err(rep);
out:
rpcrdma_recv_buffer_put(rep);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 5e7c4ba9e147..63f8be974df2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -74,11 +74,17 @@ out_unlock:
*/
static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
struct rpc_rqst *rqst,
- struct svc_rdma_send_ctxt *ctxt)
+ struct svc_rdma_send_ctxt *sctxt)
{
+ struct svc_rdma_recv_ctxt *rctxt;
int ret;
- ret = svc_rdma_map_reply_msg(rdma, ctxt, NULL, &rqst->rq_snd_buf);
+ rctxt = svc_rdma_recv_ctxt_get(rdma);
+ if (!rctxt)
+ return -EIO;
+
+ ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf);
+ svc_rdma_recv_ctxt_put(rdma, rctxt);
if (ret < 0)
return -EIO;
@@ -86,8 +92,8 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
* the rq_buffer before all retransmits are complete.
*/
get_page(virt_to_page(rqst->rq_buffer));
- ctxt->sc_send_wr.opcode = IB_WR_SEND;
- return svc_rdma_send(rdma, ctxt);
+ sctxt->sc_send_wr.opcode = IB_WR_SEND;
+ return svc_rdma_send(rdma, sctxt);
}
/* Server-side transport endpoint wants a whole page for its send
diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
new file mode 100644
index 000000000000..b63cfeaa2923
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Oracle. All rights reserved.
+ */
+
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/rpc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+/**
+ * pcl_free - Release all memory associated with a parsed chunk list
+ * @pcl: parsed chunk list
+ *
+ */
+void pcl_free(struct svc_rdma_pcl *pcl)
+{
+ while (!list_empty(&pcl->cl_chunks)) {
+ struct svc_rdma_chunk *chunk;
+
+ chunk = pcl_first_chunk(pcl);
+ list_del(&chunk->ch_list);
+ kfree(chunk);
+ }
+}
+
+static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position)
+{
+ struct svc_rdma_chunk *chunk;
+
+ chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ chunk->ch_position = position;
+ chunk->ch_length = 0;
+ chunk->ch_payload_length = 0;
+ chunk->ch_segcount = 0;
+ return chunk;
+}
+
+static struct svc_rdma_chunk *
+pcl_lookup_position(struct svc_rdma_pcl *pcl, u32 position)
+{
+ struct svc_rdma_chunk *pos;
+
+ pcl_for_each_chunk(pos, pcl) {
+ if (pos->ch_position == position)
+ return pos;
+ }
+ return NULL;
+}
+
+static void pcl_insert_position(struct svc_rdma_pcl *pcl,
+ struct svc_rdma_chunk *chunk)
+{
+ struct svc_rdma_chunk *pos;
+
+ pcl_for_each_chunk(pos, pcl) {
+ if (pos->ch_position > chunk->ch_position)
+ break;
+ }
+ __list_add(&chunk->ch_list, pos->ch_list.prev, &pos->ch_list);
+ pcl->cl_count++;
+}
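
pcl_insert_position() keeps the chunk list ordered by ascending XDR Position so later interleaving logic can walk chunks in stream order. A hedged user-space sketch of the same sorted-insert idea over a singly linked list (sketch_* names are hypothetical, not kernel APIs):

struct sketch_chunk {
        unsigned int position;
        struct sketch_chunk *next;
};

/* Insert @chunk before the first entry whose position is larger,
 * keeping the list sorted by ascending XDR Position.
 */
static void sketch_insert_position(struct sketch_chunk **head,
                                   struct sketch_chunk *chunk)
{
        struct sketch_chunk **pos = head;

        while (*pos && (*pos)->position <= chunk->position)
                pos = &(*pos)->next;
        chunk->next = *pos;
        *pos = chunk;
}
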
+
+static void pcl_set_read_segment(const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_chunk *chunk,
+ u32 handle, u32 length, u64 offset)
+{
+ struct svc_rdma_segment *segment;
+
+ segment = &chunk->ch_segments[chunk->ch_segcount];
+ segment->rs_handle = handle;
+ segment->rs_length = length;
+ segment->rs_offset = offset;
+
+ trace_svcrdma_decode_rseg(&rctxt->rc_cid, chunk, segment);
+
+ chunk->ch_length += length;
+ chunk->ch_segcount++;
+}
+
+/**
+ * pcl_alloc_call - Construct a parsed chunk list for the Call body
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
+ *
+ * Assumptions:
+ * - The incoming Read list has already been sanity checked.
+ * - cl_count is already set to the number of segments in
+ * the un-decoded list.
+ * - The list might not be in order by position.
+ *
+ * Return values:
+ * %true: Parsed chunk list was successfully constructed, and
+ * cl_count is updated to be the number of chunks (ie.
+ *	cl_count is updated to be the number of chunks (i.e.
+ * unique positions) in the Read list.
+ * %false: Memory allocation failed.
+ */
+bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ struct svc_rdma_pcl *pcl = &rctxt->rc_call_pcl;
+ unsigned int i, segcount = pcl->cl_count;
+
+ pcl->cl_count = 0;
+ for (i = 0; i < segcount; i++) {
+ struct svc_rdma_chunk *chunk;
+ u32 position, handle, length;
+ u64 offset;
+
+ p++; /* skip the list discriminator */
+ p = xdr_decode_read_segment(p, &position, &handle,
+ &length, &offset);
+ if (position != 0)
+ continue;
+
+ if (pcl_is_empty(pcl)) {
+ chunk = pcl_alloc_chunk(segcount, position);
+ if (!chunk)
+ return false;
+ pcl_insert_position(pcl, chunk);
+ } else {
+ chunk = list_first_entry(&pcl->cl_chunks,
+ struct svc_rdma_chunk,
+ ch_list);
+ }
+
+ pcl_set_read_segment(rctxt, chunk, handle, length, offset);
+ }
+
+ return true;
+}
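
pcl_alloc_call() and pcl_alloc_read() both rely on xdr_decode_read_segment() to pull one Read segment off the wire. Assuming the RFC 8166 layout (a 32-bit Position followed by an RDMA segment: 32-bit handle, 32-bit length, 64-bit offset, all big-endian), a stand-alone sketch of that decode step might look like this:

#include <stdint.h>
#include <arpa/inet.h>

/* Decode one Read segment and return a pointer just past the decoded
 * words.  All fields are big-endian 32-bit words except the 64-bit
 * offset, which is two words.
 */
static const uint32_t *sketch_decode_read_segment(const uint32_t *p,
                                                  uint32_t *position,
                                                  uint32_t *handle,
                                                  uint32_t *length,
                                                  uint64_t *offset)
{
        *position = ntohl(*p++);
        *handle = ntohl(*p++);
        *length = ntohl(*p++);
        *offset = (uint64_t)ntohl(*p++) << 32;
        *offset |= ntohl(*p++);
        return p;
}
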
+
+/**
+ * pcl_alloc_read - Construct a parsed chunk list for normal Read chunks
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
+ *
+ * Assumptions:
+ * - The incoming Read list has already been sanity checked.
+ * - cl_count is already set to the number of segments in
+ * the un-decoded list.
+ * - The list might not be in order by position.
+ *
+ * Return values:
+ * %true: Parsed chunk list was successfully constructed, and
+ * cl_count is updated to be the number of chunks (ie.
+ *	cl_count is updated to be the number of chunks (i.e.
+ * unique position values) in the Read list.
+ * %false: Memory allocation failed.
+ *
+ * TODO:
+ * - Check for chunk range overlaps
+ */
+bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ struct svc_rdma_pcl *pcl = &rctxt->rc_read_pcl;
+ unsigned int i, segcount = pcl->cl_count;
+
+ pcl->cl_count = 0;
+ for (i = 0; i < segcount; i++) {
+ struct svc_rdma_chunk *chunk;
+ u32 position, handle, length;
+ u64 offset;
+
+ p++; /* skip the list discriminator */
+ p = xdr_decode_read_segment(p, &position, &handle,
+ &length, &offset);
+ if (position == 0)
+ continue;
+
+ chunk = pcl_lookup_position(pcl, position);
+ if (!chunk) {
+ chunk = pcl_alloc_chunk(segcount, position);
+ if (!chunk)
+ return false;
+ pcl_insert_position(pcl, chunk);
+ }
+
+ pcl_set_read_segment(rctxt, chunk, handle, length, offset);
+ }
+
+ return true;
+}
+
+/**
+ * pcl_alloc_write - Construct a parsed chunk list from a Write list
+ * @rctxt: Ingress receive context
+ * @pcl: Parsed chunk list to populate
+ * @p: Start of an un-decoded Write list
+ *
+ * Assumptions:
+ * - The incoming Write list has already been sanity checked, and
+ * - cl_count is set to the number of chunks in the un-decoded list.
+ *
+ * Return values:
+ * %true: Parsed chunk list was successfully constructed.
+ * %false: Memory allocation failed.
+ */
+bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_pcl *pcl, __be32 *p)
+{
+ struct svc_rdma_segment *segment;
+ struct svc_rdma_chunk *chunk;
+ unsigned int i, j;
+ u32 segcount;
+
+ for (i = 0; i < pcl->cl_count; i++) {
+ p++; /* skip the list discriminator */
+ segcount = be32_to_cpup(p++);
+
+ chunk = pcl_alloc_chunk(segcount, 0);
+ if (!chunk)
+ return false;
+ list_add_tail(&chunk->ch_list, &pcl->cl_chunks);
+
+ for (j = 0; j < segcount; j++) {
+ segment = &chunk->ch_segments[j];
+ p = xdr_decode_rdma_segment(p, &segment->rs_handle,
+ &segment->rs_length,
+ &segment->rs_offset);
+ trace_svcrdma_decode_wseg(&rctxt->rc_cid, chunk, j);
+
+ chunk->ch_length += segment->rs_length;
+ chunk->ch_segcount++;
+ }
+ }
+ return true;
+}
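
For reference, each Write chunk that pcl_alloc_write() parses occupies a fixed number of 32-bit XDR words on the wire. A small sketch of that arithmetic, assuming the usual 4-word RDMA segment encoding:

/* One Write chunk on the wire: a present/absent discriminator word,
 * a segment-count word, then segcount RDMA segments of 4 words each
 * (handle, length, 64-bit offset).
 */
static unsigned int sketch_write_chunk_words(unsigned int segcount)
{
        return 1 + 1 + segcount * 4;
}
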
+
+static int pcl_process_region(const struct xdr_buf *xdr,
+ unsigned int offset, unsigned int length,
+ int (*actor)(const struct xdr_buf *, void *),
+ void *data)
+{
+ struct xdr_buf subbuf;
+
+ if (!length)
+ return 0;
+ if (xdr_buf_subsegment(xdr, &subbuf, offset, length))
+ return -EMSGSIZE;
+ return actor(&subbuf, data);
+}
+
+/**
+ * pcl_process_nonpayloads - Process non-payload regions inside @xdr
+ * @pcl: Chunk list to process
+ * @xdr: xdr_buf to process
+ * @actor: Function to invoke on each non-payload region
+ * @data: Arguments for @actor
+ *
+ * This mechanism must ignore not only result payloads that were already
+ * sent via RDMA Write, but also XDR padding for those payloads that
+ * the upper layer has added.
+ *
+ * Assumptions:
+ * The xdr->len and ch_position fields are aligned to 4-byte multiples.
+ *
+ * Returns:
+ * On success, zero,
+ * %-EMSGSIZE on XDR buffer overflow, or
+ * The return value of @actor
+ */
+int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl,
+ const struct xdr_buf *xdr,
+ int (*actor)(const struct xdr_buf *, void *),
+ void *data)
+{
+ struct svc_rdma_chunk *chunk, *next;
+ unsigned int start;
+ int ret;
+
+ chunk = pcl_first_chunk(pcl);
+
+ /* No result payloads were generated */
+ if (!chunk || !chunk->ch_payload_length)
+ return actor(xdr, data);
+
+ /* Process the region before the first result payload */
+ ret = pcl_process_region(xdr, 0, chunk->ch_position, actor, data);
+ if (ret < 0)
+ return ret;
+
+ /* Process the regions between each middle result payload */
+ while ((next = pcl_next_chunk(pcl, chunk))) {
+ if (!next->ch_payload_length)
+ break;
+
+ start = pcl_chunk_end_offset(chunk);
+ ret = pcl_process_region(xdr, start, next->ch_position - start,
+ actor, data);
+ if (ret < 0)
+ return ret;
+
+ chunk = next;
+ }
+
+ /* Process the region after the last result payload */
+ start = pcl_chunk_end_offset(chunk);
+ ret = pcl_process_region(xdr, start, xdr->len - start, actor, data);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
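
To make the region arithmetic in pcl_process_nonpayloads() concrete, here is a hedged user-space sketch that prints the non-payload byte ranges of a message given the (position, padded length) of each result payload; the padded length plays the role of ch_payload_length after XDR round-up, and the sketch assumes every listed payload was actually written via RDMA Write:

#include <stdio.h>

struct sketch_payload {
        unsigned int position;       /* XDR offset of the payload */
        unsigned int padded_length;  /* payload length after XDR round-up */
};

static void sketch_nonpayload_regions(const struct sketch_payload *p,
                                      unsigned int count,
                                      unsigned int total_len)
{
        unsigned int start = 0, i;

        for (i = 0; i < count; i++) {
                printf("region: [%u, %u)\n", start, p[i].position);
                start = p[i].position + p[i].padded_length;
        }
        printf("region: [%u, %u)\n", start, total_len);
}
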
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index c6ea2903c21a..cbdb71247755 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -93,6 +93,7 @@
* (see rdma_read_complete() below).
*/
+#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
@@ -143,6 +144,10 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
goto fail2;
svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
+ pcl_init(&ctxt->rc_call_pcl);
+ pcl_init(&ctxt->rc_read_pcl);
+ pcl_init(&ctxt->rc_write_pcl);
+ pcl_init(&ctxt->rc_reply_pcl);
ctxt->rc_recv_wr.next = NULL;
ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe;
@@ -189,8 +194,13 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
}
}
-static struct svc_rdma_recv_ctxt *
-svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
+/**
+ * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt
+ * @rdma: controlling svcxprt_rdma
+ *
+ * Returns a recv_ctxt or (rarely) NULL if none are available.
+ */
+struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
struct llist_node *node;
@@ -202,7 +212,6 @@ svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
out:
ctxt->rc_page_count = 0;
- ctxt->rc_read_payload_length = 0;
return ctxt;
out_empty:
@@ -226,6 +235,11 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
for (i = 0; i < ctxt->rc_page_count; i++)
put_page(ctxt->rc_pages[i]);
+ pcl_free(&ctxt->rc_call_pcl);
+ pcl_free(&ctxt->rc_read_pcl);
+ pcl_free(&ctxt->rc_write_pcl);
+ pcl_free(&ctxt->rc_reply_pcl);
+
if (!ctxt->rc_temp)
llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
else
@@ -385,100 +399,123 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
arg->len = ctxt->rc_byte_len;
}
-/* This accommodates the largest possible Write chunk.
- */
-#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
-
-/* This accommodates the largest possible Position-Zero
- * Read chunk or Reply chunk.
- */
-#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
-
-/* Sanity check the Read list.
- *
- * Implementation limits:
- * - This implementation supports only one Read chunk.
+/**
+ * xdr_count_read_segments - Count number of Read segments in Read list
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
*
- * Sanity checks:
- * - Read list does not overflow Receive buffer.
- * - Segment size limited by largest NFS data payload.
+ * Before allocating anything, ensure the ingress Read list is safe
+ * to use.
*
- * The segment count is limited to how many segments can
- * fit in the transport header without overflowing the
- * buffer. That's about 40 Read segments for a 1KB inline
- * threshold.
+ * The segment count is limited to how many segments can fit in the
+ * transport header without overflowing the buffer. That's about 40
+ * Read segments for a 1KB inline threshold.
*
* Return values:
- * %true: Read list is valid. @rctxt's xdr_stream is updated
- * to point to the first byte past the Read list.
- * %false: Read list is corrupt. @rctxt's xdr_stream is left
- * in an unknown state.
+ * %true: Read list is valid. @rctxt's xdr_stream is updated to point
+ * to the first byte past the Read list. rc_read_pcl and
+ * rc_call_pcl cl_count fields are set to the number of
+ * Read segments in the list.
+ * %false: Read list is corrupt. @rctxt's xdr_stream is left in an
+ * unknown state.
*/
-static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
+static bool xdr_count_read_segments(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
{
- u32 position, len;
- bool first;
- __be32 *p;
-
- p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
- if (!p)
- return false;
-
- len = 0;
- first = true;
+ rctxt->rc_call_pcl.cl_count = 0;
+ rctxt->rc_read_pcl.cl_count = 0;
while (xdr_item_is_present(p)) {
+ u32 position, handle, length;
+ u64 offset;
+
p = xdr_inline_decode(&rctxt->rc_stream,
rpcrdma_readseg_maxsz * sizeof(*p));
if (!p)
return false;
- if (first) {
- position = be32_to_cpup(p);
- first = false;
- } else if (be32_to_cpup(p) != position) {
- return false;
+ xdr_decode_read_segment(p, &position, &handle,
+ &length, &offset);
+ if (position) {
+ if (position & 3)
+ return false;
+ ++rctxt->rc_read_pcl.cl_count;
+ } else {
+ ++rctxt->rc_call_pcl.cl_count;
}
- p += 2;
- len += be32_to_cpup(p);
p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
return false;
}
- return len <= MAX_BYTES_SPECIAL_CHUNK;
+ return true;
}
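
The "about 40 Read segments" figure follows directly from the on-the-wire sizes. A rough, hypothetical back-of-the-envelope in code form (word counts are the RPC/RDMA v1 fixed header and list encodings):

/* Each Read list entry costs one discriminator word plus a 5-word
 * read segment (position, handle, length, 64-bit offset).
 */
static unsigned int sketch_max_read_segments(unsigned int inline_bytes)
{
        const unsigned int fixed_hdr = 4 * 4;    /* xid, vers, credits, proc */
        const unsigned int terminators = 3 * 4;  /* Read/Write/Reply list ends */
        const unsigned int per_entry = (1 + 5) * 4;

        return (inline_bytes - fixed_hdr - terminators) / per_entry;
}
/* sketch_max_read_segments(1024) == 41 -- "about 40". */
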
-/* The segment count is limited to how many segments can
- * fit in the transport header without overflowing the
- * buffer. That's about 60 Write segments for a 1KB inline
- * threshold.
+/* Sanity check the Read list.
+ *
+ * Sanity checks:
+ * - Read list does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ * %true: Read list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Read list.
+ * %false: Read list is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen)
+static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 i, segcount, total;
__be32 *p;
p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
return false;
- segcount = be32_to_cpup(p);
+ if (!xdr_count_read_segments(rctxt, p))
+ return false;
+ if (!pcl_alloc_call(rctxt, p))
+ return false;
+ return pcl_alloc_read(rctxt, p);
+}
- total = 0;
- for (i = 0; i < segcount; i++) {
- u32 handle, length;
- u64 offset;
+static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt)
+{
+ u32 segcount;
+ __be32 *p;
- p = xdr_inline_decode(&rctxt->rc_stream,
- rpcrdma_segment_maxsz * sizeof(*p));
- if (!p)
- return false;
+ if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount))
+ return false;
- xdr_decode_rdma_segment(p, &handle, &length, &offset);
- trace_svcrdma_decode_wseg(handle, length, offset);
+ /* A bogus segcount causes this buffer overflow check to fail. */
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ segcount * rpcrdma_segment_maxsz * sizeof(*p));
+ return p != NULL;
+}
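
The overflow guard above works because xdr_inline_decode() fails when the claimed number of segments cannot fit in the bytes remaining in the Receive buffer. An equivalent stand-alone check, written defensively so an attacker-supplied count is never multiplied:

#include <stdbool.h>
#include <stddef.h>

/* A segment count is acceptable only if that many 4-word RDMA segments
 * fit in the bytes left in the Receive buffer.
 */
static bool sketch_segcount_fits(size_t segcount, size_t bytes_remaining)
{
        const size_t bytes_per_segment = 4 * 4;

        return segcount <= bytes_remaining / bytes_per_segment;
}
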
- total += length;
+/**
+ * xdr_count_write_chunks - Count number of Write chunks in Write list
+ * @rctxt: Received header and decoding state
+ * @p: start of an un-decoded Write list
+ *
+ * Before allocating anything, ensure the ingress Write list is
+ * safe to use.
+ *
+ * Return values:
+ * %true: Write list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Write list, and
+ * the number of Write chunks is in rc_write_pcl.cl_count.
+ * %false: Write list is corrupt. @rctxt's xdr_stream is left
+ * in an indeterminate state.
+ */
+static bool xdr_count_write_chunks(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ rctxt->rc_write_pcl.cl_count = 0;
+ while (xdr_item_is_present(p)) {
+ if (!xdr_check_write_chunk(rctxt))
+ return false;
+ ++rctxt->rc_write_pcl.cl_count;
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
}
- return total <= maxlen;
+ return true;
}
/* Sanity check the Write list.
@@ -498,24 +535,18 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen)
*/
static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 chcount = 0;
__be32 *p;
p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
return false;
- rctxt->rc_write_list = p;
- while (xdr_item_is_present(p)) {
- if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK))
- return false;
- ++chcount;
- p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
- if (!p)
- return false;
- }
- if (!chcount)
- rctxt->rc_write_list = NULL;
- return chcount < 2;
+ if (!xdr_count_write_chunks(rctxt, p))
+ return false;
+ if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p))
+ return false;
+
+ rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl);
+ return true;
}
/* Sanity check the Reply chunk.
@@ -537,13 +568,14 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
return false;
- rctxt->rc_reply_chunk = NULL;
- if (xdr_item_is_present(p)) {
- if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK))
- return false;
- rctxt->rc_reply_chunk = p;
- }
- return true;
+
+ if (!xdr_item_is_present(p))
+ return true;
+ if (!xdr_check_write_chunk(rctxt))
+ return false;
+
+ rctxt->rc_reply_pcl.cl_count = 1;
+ return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p);
}
/* RPC-over-RDMA Version One private extension: Remote Invalidation.
@@ -552,60 +584,53 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
*
* If there is exactly one distinct R_key in the received transport
* header, set rc_inv_rkey to that R_key. Otherwise, set it to zero.
- *
- * Perform this operation while the received transport header is
- * still in the CPU cache.
*/
static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
- __be32 inv_rkey, *p;
- u32 i, segcount;
+ struct svc_rdma_segment *segment;
+ struct svc_rdma_chunk *chunk;
+ u32 inv_rkey;
ctxt->rc_inv_rkey = 0;
if (!rdma->sc_snd_w_inv)
return;
- inv_rkey = xdr_zero;
- p = ctxt->rc_recv_buf;
- p += rpcrdma_fixed_maxsz;
-
- /* Read list */
- while (xdr_item_is_present(p++)) {
- p++; /* position */
- if (inv_rkey == xdr_zero)
- inv_rkey = *p;
- else if (inv_rkey != *p)
- return;
- p += 4;
+ inv_rkey = 0;
+ pcl_for_each_chunk(chunk, &ctxt->rc_call_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
+ return;
+ }
}
-
- /* Write list */
- while (xdr_item_is_present(p++)) {
- segcount = be32_to_cpup(p++);
- for (i = 0; i < segcount; i++) {
- if (inv_rkey == xdr_zero)
- inv_rkey = *p;
- else if (inv_rkey != *p)
+ pcl_for_each_chunk(chunk, &ctxt->rc_read_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
return;
- p += 4;
}
}
-
- /* Reply chunk */
- if (xdr_item_is_present(p++)) {
- segcount = be32_to_cpup(p++);
- for (i = 0; i < segcount; i++) {
- if (inv_rkey == xdr_zero)
- inv_rkey = *p;
- else if (inv_rkey != *p)
+ pcl_for_each_chunk(chunk, &ctxt->rc_write_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
return;
- p += 4;
}
}
-
- ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
+ pcl_for_each_chunk(chunk, &ctxt->rc_reply_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
+ return;
+ }
+ }
+ ctxt->rc_inv_rkey = inv_rkey;
}
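
The rewritten svc_rdma_get_inv_rkey() implements the Remote Invalidation rule: a Send With Invalidate may name an R_key only if every segment in the Call used that same R_key. A condensed sketch of that reduction over a flat handle array (hypothetical helper; zero doubles as "no R_key", as in the kernel code):

/* Fold over all segment handles: the result is the single R_key used
 * everywhere, or zero if the handles differ (or if there are none).
 */
static unsigned int sketch_single_rkey(const unsigned int *handles,
                                       unsigned int count)
{
        unsigned int i, rkey = 0;

        for (i = 0; i < count; i++) {
                if (rkey == 0)
                        rkey = handles[i];
                else if (rkey != handles[i])
                        return 0;
        }
        return rkey;
}
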
/**
@@ -641,7 +666,8 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg,
if (*p != rpcrdma_version)
goto out_version;
p += 2;
- switch (*p) {
+ rctxt->rc_msgtype = *p;
+ switch (rctxt->rc_msgtype) {
case rdma_msg:
break;
case rdma_nomsg:
@@ -735,30 +761,28 @@ static void svc_rdma_send_error(struct svcxprt_rdma *rdma,
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
-static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
- __be32 *rdma_resp)
+static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
+ struct svc_rdma_recv_ctxt *rctxt)
{
- __be32 *p;
+ __be32 *p = rctxt->rc_recv_buf;
if (!xprt->xpt_bc_xprt)
return false;
- p = rdma_resp + 3;
- if (*p++ != rdma_msg)
+ if (rctxt->rc_msgtype != rdma_msg)
return false;
- if (*p++ != xdr_zero)
+ if (!pcl_is_empty(&rctxt->rc_call_pcl))
return false;
- if (*p++ != xdr_zero)
+ if (!pcl_is_empty(&rctxt->rc_read_pcl))
return false;
- if (*p++ != xdr_zero)
+ if (!pcl_is_empty(&rctxt->rc_write_pcl))
return false;
-
- /* XID sanity */
- if (*p++ != *rdma_resp)
+ if (!pcl_is_empty(&rctxt->rc_reply_pcl))
return false;
- /* call direction */
- if (*p == cpu_to_be32(RPC_CALL))
+
+ /* RPC call direction */
+ if (*(p + 8) == cpu_to_be32(RPC_CALL))
return false;
return true;
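
The "*(p + 8)" test reads the embedded RPC header's call-direction field. Assuming the layout implied by the checks above (four fixed RPC/RDMA header words, three one-word empty chunk lists, then the embedded RPC xid followed by the direction), a sketch of the same word arithmetic:

#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

static bool sketch_is_rpc_reply(const uint32_t *recv_buf)
{
        const unsigned int direction_word = 4   /* RPC/RDMA fixed header */
                                          + 3   /* three empty chunk lists */
                                          + 1;  /* embedded RPC xid */

        return ntohl(recv_buf[direction_word]) == 1;  /* RPC_REPLY */
}
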
@@ -800,7 +824,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
struct svcxprt_rdma *rdma_xprt =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct svc_rdma_recv_ctxt *ctxt;
- __be32 *p;
int ret;
rqstp->rq_xprt_ctxt = NULL;
@@ -833,7 +856,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_respages = rqstp->rq_pages;
rqstp->rq_next_page = rqstp->rq_respages;
- p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
if (ret < 0)
goto out_err;
@@ -841,14 +863,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto out_drop;
rqstp->rq_xprt_hlen = ret;
- if (svc_rdma_is_backchannel_reply(xprt, p))
+ if (svc_rdma_is_reverse_direction_reply(xprt, ctxt))
goto out_backchannel;
svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
- p += rpcrdma_fixed_maxsz;
- if (*p != xdr_zero)
- goto out_readchunk;
+ if (!pcl_is_empty(&ctxt->rc_read_pcl) ||
+ !pcl_is_empty(&ctxt->rc_call_pcl))
+ goto out_readlist;
complete:
rqstp->rq_xprt_ctxt = ctxt;
@@ -856,10 +878,10 @@ complete:
svc_xprt_copy_addrs(rqstp, xprt);
return rqstp->rq_arg.len;
-out_readchunk:
- ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p);
+out_readlist:
+ ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
if (ret < 0)
- goto out_postfail;
+ goto out_readfail;
return 0;
out_err:
@@ -867,7 +889,7 @@ out_err:
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
return 0;
-out_postfail:
+out_readfail:
if (ret == -EINVAL)
svc_rdma_send_error(rdma_xprt, ctxt, ret);
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index fe54cbe97a46..0b63e1321d74 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -137,7 +137,7 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
}
/* A chunk context tracks all I/O for moving one Read or Write
- * chunk. This is a a set of rdma_rw's that handle data movement
+ * chunk. This is a set of rdma_rw's that handle data movement
* for all segments of one chunk.
*
* These are small, acquired with a single allocator call, and
@@ -190,14 +190,14 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
* - Stores arguments for the SGL constructor functions
*/
struct svc_rdma_write_info {
+ const struct svc_rdma_chunk *wi_chunk;
+
/* write state of this chunk */
unsigned int wi_seg_off;
unsigned int wi_seg_no;
- unsigned int wi_nsegs;
- __be32 *wi_segs;
/* SGL constructor arguments */
- struct xdr_buf *wi_xdr;
+ const struct xdr_buf *wi_xdr;
unsigned char *wi_base;
unsigned int wi_next_off;
@@ -205,7 +205,8 @@ struct svc_rdma_write_info {
};
static struct svc_rdma_write_info *
-svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
+svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_chunk *chunk)
{
struct svc_rdma_write_info *info;
@@ -213,10 +214,9 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
if (!info)
return info;
+ info->wi_chunk = chunk;
info->wi_seg_off = 0;
info->wi_seg_no = 0;
- info->wi_nsegs = be32_to_cpup(++chunk);
- info->wi_segs = ++chunk;
svc_rdma_cc_init(rdma, &info->wi_cc);
info->wi_cc.cc_cqe.done = svc_rdma_write_done;
return info;
@@ -258,11 +258,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
/* State for pulling a Read chunk.
*/
struct svc_rdma_read_info {
+ struct svc_rqst *ri_rqst;
struct svc_rdma_recv_ctxt *ri_readctxt;
- unsigned int ri_position;
unsigned int ri_pageno;
unsigned int ri_pageoff;
- unsigned int ri_chunklen;
+ unsigned int ri_totalbytes;
struct svc_rdma_chunk_ctxt ri_cc;
};
@@ -358,7 +358,6 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
do {
if (atomic_sub_return(cc->cc_sqecount,
&rdma->sc_sq_avail) > 0) {
- trace_svcrdma_post_chunk(&cc->cc_cid, cc->cc_sqecount);
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
if (ret)
break;
@@ -405,7 +404,7 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
struct svc_rdma_rw_ctxt *ctxt)
{
unsigned int sge_no, sge_bytes, page_off, page_no;
- struct xdr_buf *xdr = info->wi_xdr;
+ const struct xdr_buf *xdr = info->wi_xdr;
struct scatterlist *sg;
struct page **page;
@@ -443,40 +442,36 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
{
struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
struct svcxprt_rdma *rdma = cc->cc_rdma;
+ const struct svc_rdma_segment *seg;
struct svc_rdma_rw_ctxt *ctxt;
- __be32 *seg;
int ret;
- seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
do {
unsigned int write_len;
- u32 handle, length;
u64 offset;
- if (info->wi_seg_no >= info->wi_nsegs)
+ seg = &info->wi_chunk->ch_segments[info->wi_seg_no];
+ if (!seg)
goto out_overflow;
- xdr_decode_rdma_segment(seg, &handle, &length, &offset);
- offset += info->wi_seg_off;
-
- write_len = min(remaining, length - info->wi_seg_off);
+ write_len = min(remaining, seg->rs_length - info->wi_seg_off);
+ if (!write_len)
+ goto out_overflow;
ctxt = svc_rdma_get_rw_ctxt(rdma,
(write_len >> PAGE_SHIFT) + 2);
if (!ctxt)
return -ENOMEM;
constructor(info, write_len, ctxt);
- ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle,
+ offset = seg->rs_offset + info->wi_seg_off;
+ ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle,
DMA_TO_DEVICE);
if (ret < 0)
return -EIO;
- trace_svcrdma_send_wseg(handle, write_len, offset);
-
list_add(&ctxt->rw_list, &cc->cc_rwctxts);
cc->cc_sqecount += ret;
- if (write_len == length - info->wi_seg_off) {
- seg += 4;
+ if (write_len == seg->rs_length - info->wi_seg_off) {
info->wi_seg_no++;
info->wi_seg_off = 0;
} else {
@@ -489,31 +484,46 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
out_overflow:
trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no,
- info->wi_nsegs);
+ info->wi_chunk->ch_segcount);
return -E2BIG;
}
-/* Send one of an xdr_buf's kvecs by itself. To send a Reply
- * chunk, the whole RPC Reply is written back to the client.
- * This function writes either the head or tail of the xdr_buf
- * containing the Reply.
+/**
+ * svc_rdma_iov_write - Construct RDMA Writes from an iov
+ * @info: pointer to write arguments
+ * @iov: kvec to write
+ *
+ * Returns:
+ *   On success, returns zero
+ * %-E2BIG if the client-provided Write chunk is too small
+ * %-ENOMEM if a resource has been exhausted
+ * %-EIO if an rdma-rw error occurred
*/
-static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
- struct kvec *vec)
+static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
+ const struct kvec *iov)
{
- info->wi_base = vec->iov_base;
+ info->wi_base = iov->iov_base;
return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
- vec->iov_len);
+ iov->iov_len);
}
-/* Send an xdr_buf's page list by itself. A Write chunk is just
- * the page list. A Reply chunk is @xdr's head, page list, and
- * tail. This function is shared between the two types of chunk.
+/**
+ * svc_rdma_pages_write - Construct RDMA Writes from pages
+ * @info: pointer to write arguments
+ * @xdr: xdr_buf with pages to write
+ * @offset: offset into the content of @xdr
+ * @length: number of bytes to write
+ *
+ * Returns:
+ *   On success, returns zero
+ * %-E2BIG if the client-provided Write chunk is too small
+ * %-ENOMEM if a resource has been exhausted
+ * %-EIO if an rdma-rw error occurred
*/
-static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
- struct xdr_buf *xdr,
- unsigned int offset,
- unsigned long length)
+static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
+ const struct xdr_buf *xdr,
+ unsigned int offset,
+ unsigned long length)
{
info->wi_xdr = xdr;
info->wi_next_off = offset - xdr->head[0].iov_len;
@@ -522,12 +532,48 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
}
/**
+ * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf
+ * @xdr: xdr_buf to write
+ * @data: pointer to write arguments
+ *
+ * Returns:
+ *   On success, returns zero
+ * %-E2BIG if the client-provided Write chunk is too small
+ * %-ENOMEM if a resource has been exhausted
+ * %-EIO if an rdma-rw error occurred
+ */
+static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
+{
+ struct svc_rdma_write_info *info = data;
+ int ret;
+
+ if (xdr->head[0].iov_len) {
+ ret = svc_rdma_iov_write(info, &xdr->head[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (xdr->page_len) {
+ ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len,
+ xdr->page_len);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (xdr->tail[0].iov_len) {
+ ret = svc_rdma_iov_write(info, &xdr->tail[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ return xdr->len;
+}
+
+/**
* svc_rdma_send_write_chunk - Write all segments in a Write chunk
* @rdma: controlling RDMA transport
- * @wr_ch: Write chunk provided by client
+ * @chunk: Write chunk provided by the client
* @xdr: xdr_buf containing the data payload
- * @offset: payload's byte offset in @xdr
- * @length: size of payload, in bytes
*
* Returns a non-negative number of bytes the chunk consumed, or
* %-E2BIG if the payload was larger than the Write chunk,
@@ -536,30 +582,28 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
* %-ENOTCONN if posting failed (connection is lost),
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
- struct xdr_buf *xdr,
- unsigned int offset, unsigned long length)
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_chunk *chunk,
+ const struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
+ struct svc_rdma_chunk_ctxt *cc;
int ret;
- if (!length)
- return 0;
-
- info = svc_rdma_write_info_alloc(rdma, wr_ch);
+ info = svc_rdma_write_info_alloc(rdma, chunk);
if (!info)
return -ENOMEM;
+ cc = &info->wi_cc;
- ret = svc_rdma_send_xdr_pagelist(info, xdr, offset, length);
- if (ret < 0)
+ ret = svc_rdma_xb_write(xdr, info);
+ if (ret != xdr->len)
goto out_err;
- ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
+ ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
-
- trace_svcrdma_send_write_chunk(xdr->page_len);
- return length;
+ return xdr->len;
out_err:
svc_rdma_write_info_free(info);
@@ -581,62 +625,62 @@ out_err:
*/
int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
const struct svc_rdma_recv_ctxt *rctxt,
- struct xdr_buf *xdr)
+ const struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
- int consumed, ret;
+ struct svc_rdma_chunk_ctxt *cc;
+ struct svc_rdma_chunk *chunk;
+ int ret;
- info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk);
+ if (pcl_is_empty(&rctxt->rc_reply_pcl))
+ return 0;
+
+ chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
+ info = svc_rdma_write_info_alloc(rdma, chunk);
if (!info)
return -ENOMEM;
+ cc = &info->wi_cc;
- ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
+ ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_write, info);
if (ret < 0)
goto out_err;
- consumed = xdr->head[0].iov_len;
-
- /* Send the page list in the Reply chunk only if the
- * client did not provide Write chunks.
- */
- if (!rctxt->rc_write_list && xdr->page_len) {
- ret = svc_rdma_send_xdr_pagelist(info, xdr,
- xdr->head[0].iov_len,
- xdr->page_len);
- if (ret < 0)
- goto out_err;
- consumed += xdr->page_len;
- }
-
- if (xdr->tail[0].iov_len) {
- ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
- if (ret < 0)
- goto out_err;
- consumed += xdr->tail[0].iov_len;
- }
- ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
+ ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
- trace_svcrdma_send_reply_chunk(consumed);
- return consumed;
+ return xdr->len;
out_err:
svc_rdma_write_info_free(info);
return ret;
}
+/**
+ * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
+ * @info: context for ongoing I/O
+ * @segment: co-ordinates of remote memory to be read
+ *
+ * Returns:
+ * %0: the Read WR chain was constructed successfully
+ * %-EINVAL: there were not enough rq_pages to finish
+ * %-ENOMEM: allocating a local resources failed
+ * %-EIO: a DMA mapping error occurred
+ */
static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
- struct svc_rqst *rqstp,
- u32 rkey, u32 len, u64 offset)
+ const struct svc_rdma_segment *segment)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
+ struct svc_rqst *rqstp = info->ri_rqst;
struct svc_rdma_rw_ctxt *ctxt;
- unsigned int sge_no, seg_len;
+ unsigned int sge_no, seg_len, len;
struct scatterlist *sg;
int ret;
+ len = segment->rs_length;
sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no);
if (!ctxt)
@@ -670,8 +714,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
goto out_overrun;
}
- ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey,
- DMA_FROM_DEVICE);
+ ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset,
+ segment->rs_handle, DMA_FROM_DEVICE);
if (ret < 0)
return -EIO;
@@ -684,54 +728,177 @@ out_overrun:
return -EINVAL;
}
-/* Walk the segments in the Read chunk starting at @p and construct
- * RDMA Read operations to pull the chunk to the server.
+/**
+ * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk
+ * @info: context for ongoing I/O
+ * @chunk: Read chunk to pull
+ *
+ * Return values:
+ * %0: the Read WR chain was constructed successfully
+ * %-EINVAL: there were not enough resources to finish
+ *   %-ENOMEM: allocating local resources failed
+ * %-EIO: a DMA mapping error occurred
*/
-static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info,
+ const struct svc_rdma_chunk *chunk)
{
+ const struct svc_rdma_segment *segment;
int ret;
ret = -EINVAL;
- info->ri_chunklen = 0;
- while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
- u32 handle, length;
- u64 offset;
+ pcl_for_each_segment(segment, chunk) {
+ ret = svc_rdma_build_read_segment(info, segment);
+ if (ret < 0)
+ break;
+ info->ri_totalbytes += segment->rs_length;
+ }
+ return ret;
+}
+
+/**
+ * svc_rdma_copy_inline_range - Copy part of the inline content into pages
+ * @info: context for RDMA Reads
+ * @offset: offset into the Receive buffer of region to copy
+ * @remaining: length of region to copy
+ *
+ * Take a page at a time from rqstp->rq_pages and copy the inline
+ * content from the Receive buffer into that page. Update
+ * info->ri_pageno and info->ri_pageoff so that the next RDMA Read
+ * result will land contiguously with the copied content.
+ *
+ * Return values:
+ * %0: Inline content was successfully copied
+ * %-EINVAL: offset or length was incorrect
+ */
+static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
+ unsigned int offset,
+ unsigned int remaining)
+{
+ struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ unsigned char *dst, *src = head->rc_recv_buf;
+ struct svc_rqst *rqstp = info->ri_rqst;
+ unsigned int page_no, numpages;
+
+ numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT;
+ for (page_no = 0; page_no < numpages; page_no++) {
+ unsigned int page_len;
+
+ page_len = min_t(unsigned int, remaining,
+ PAGE_SIZE - info->ri_pageoff);
+
+ head->rc_arg.pages[info->ri_pageno] =
+ rqstp->rq_pages[info->ri_pageno];
+ if (!info->ri_pageoff)
+ head->rc_page_count++;
+
+ dst = page_address(head->rc_arg.pages[info->ri_pageno]);
+		memcpy(dst + info->ri_pageoff, src + offset, page_len);
+
+ info->ri_totalbytes += page_len;
+ info->ri_pageoff += page_len;
+ if (info->ri_pageoff == PAGE_SIZE) {
+ info->ri_pageno++;
+ info->ri_pageoff = 0;
+ }
+ remaining -= page_len;
+ offset += page_len;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
+ * @info: context for RDMA Reads
+ *
+ * The chunk data lands in head->rc_arg as a series of contiguous pages,
+ * like an incoming TCP call.
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info)
+{
+ struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+ struct svc_rdma_chunk *chunk, *next;
+ struct xdr_buf *buf = &head->rc_arg;
+ unsigned int start, length;
+ int ret;
- p = xdr_decode_rdma_segment(p, &handle, &length, &offset);
- ret = svc_rdma_build_read_segment(info, rqstp, handle, length,
- offset);
+ start = 0;
+ chunk = pcl_first_chunk(pcl);
+ length = chunk->ch_position;
+ ret = svc_rdma_copy_inline_range(info, start, length);
+ if (ret < 0)
+ return ret;
+
+ pcl_for_each_chunk(chunk, pcl) {
+ ret = svc_rdma_build_read_chunk(info, chunk);
if (ret < 0)
+ return ret;
+
+ next = pcl_next_chunk(pcl, chunk);
+ if (!next)
break;
- trace_svcrdma_send_rseg(handle, length, offset);
- info->ri_chunklen += length;
+ start += length;
+ length = next->ch_position - info->ri_totalbytes;
+ ret = svc_rdma_copy_inline_range(info, start, length);
+ if (ret < 0)
+ return ret;
}
- return ret;
+ start += length;
+ length = head->rc_byte_len - start;
+ ret = svc_rdma_copy_inline_range(info, start, length);
+ if (ret < 0)
+ return ret;
+
+ buf->len += info->ri_totalbytes;
+ buf->buflen += info->ri_totalbytes;
+
+ head->rc_hdr_count = 1;
+ buf->head[0].iov_base = page_address(head->rc_pages[0]);
+ buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+ buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
+ return 0;
}
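
The offset arithmetic in svc_rdma_read_multiple_chunks() interleaves copies from the inline Receive buffer (which never holds chunk data) with RDMA Reads of each chunk. A hedged user-space sketch that only prints the inline copy ranges, given each chunk's XDR position and length:

#include <stdio.h>

/* pos[] and len[] describe nchunks Read chunks in ascending Position
 * order; inline_len is the size of the Receive buffer.
 */
static void sketch_interleave(const unsigned int *pos, const unsigned int *len,
                              unsigned int nchunks, unsigned int inline_len)
{
        unsigned int totalbytes = 0, start = 0, length, i;

        for (i = 0; i < nchunks; i++) {
                length = pos[i] - totalbytes;   /* inline gap before chunk i */
                printf("copy inline [%u, %u), then pull chunk %u\n",
                       start, start + length, i);
                totalbytes += length + len[i];
                start += length;
        }
        printf("copy inline [%u, %u)\n", start, inline_len);
}
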
-/* Construct RDMA Reads to pull over a normal Read chunk. The chunk
- * data lands in the page list of head->rc_arg.pages.
+/**
+ * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
+ * @info: context for RDMA Reads
+ *
+ * The chunk data lands in the page list of head->rc_arg.pages.
*
* Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
* Therefore, XDR round-up of the Read chunk and trailing
* inline content must both be added at the end of the pagelist.
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
*/
-static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ struct xdr_buf *buf = &head->rc_arg;
+ struct svc_rdma_chunk *chunk;
+ unsigned int length;
int ret;
- ret = svc_rdma_build_read_chunk(rqstp, info, p);
+ chunk = pcl_first_chunk(&head->rc_read_pcl);
+ ret = svc_rdma_build_read_chunk(info, chunk);
if (ret < 0)
goto out;
- trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position);
-
head->rc_hdr_count = 0;
/* Split the Receive buffer between the head and tail
@@ -739,11 +906,9 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
* chunk is not included in either the pagelist or in
* the tail.
*/
- head->rc_arg.tail[0].iov_base =
- head->rc_arg.head[0].iov_base + info->ri_position;
- head->rc_arg.tail[0].iov_len =
- head->rc_arg.head[0].iov_len - info->ri_position;
- head->rc_arg.head[0].iov_len = info->ri_position;
+ buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position;
+ buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position;
+ buf->head[0].iov_len = chunk->ch_position;
/* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
*
@@ -754,50 +919,149 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
* Currently these chunks always start at page offset 0,
* thus the rounded-up length never crosses a page boundary.
*/
- info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2;
-
- head->rc_arg.page_len = info->ri_chunklen;
- head->rc_arg.len += info->ri_chunklen;
- head->rc_arg.buflen += info->ri_chunklen;
+ length = XDR_QUADLEN(info->ri_totalbytes) << 2;
+ buf->page_len = length;
+ buf->len += length;
+ buf->buflen += length;
out:
return ret;
}
-/* Construct RDMA Reads to pull over a Position Zero Read chunk.
- * The start of the data lands in the first page just after
- * the Transport header, and the rest lands in the page list of
+/**
+ * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk
+ * @info: context for RDMA Reads
+ * @chunk: parsed Call chunk to pull
+ * @offset: offset of region to pull
+ * @length: length of region to pull
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: there were not enough resources to finish
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info,
+ const struct svc_rdma_chunk *chunk,
+ unsigned int offset, unsigned int length)
+{
+ const struct svc_rdma_segment *segment;
+ int ret;
+
+ ret = -EINVAL;
+ pcl_for_each_segment(segment, chunk) {
+ struct svc_rdma_segment dummy;
+
+ if (offset > segment->rs_length) {
+ offset -= segment->rs_length;
+ continue;
+ }
+
+ dummy.rs_handle = segment->rs_handle;
+ dummy.rs_length = min_t(u32, length, segment->rs_length) - offset;
+ dummy.rs_offset = segment->rs_offset + offset;
+
+ ret = svc_rdma_build_read_segment(info, &dummy);
+ if (ret < 0)
+ break;
+
+ info->ri_totalbytes += dummy.rs_length;
+ length -= dummy.rs_length;
+ offset = 0;
+ }
+ return ret;
+}
+
+/**
+ * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message
+ * @info: context for RDMA Reads
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: there were not enough resources to finish
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
+{
+ struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ const struct svc_rdma_chunk *call_chunk =
+ pcl_first_chunk(&head->rc_call_pcl);
+ const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+ struct svc_rdma_chunk *chunk, *next;
+ unsigned int start, length;
+ int ret;
+
+ if (pcl_is_empty(pcl))
+ return svc_rdma_build_read_chunk(info, call_chunk);
+
+ start = 0;
+ chunk = pcl_first_chunk(pcl);
+ length = chunk->ch_position;
+ ret = svc_rdma_read_chunk_range(info, call_chunk, start, length);
+ if (ret < 0)
+ return ret;
+
+ pcl_for_each_chunk(chunk, pcl) {
+ ret = svc_rdma_build_read_chunk(info, chunk);
+ if (ret < 0)
+ return ret;
+
+ next = pcl_next_chunk(pcl, chunk);
+ if (!next)
+ break;
+
+ start += length;
+ length = next->ch_position - info->ri_totalbytes;
+ ret = svc_rdma_read_chunk_range(info, call_chunk,
+ start, length);
+ if (ret < 0)
+ return ret;
+ }
+
+ start += length;
+ length = call_chunk->ch_length - start;
+ return svc_rdma_read_chunk_range(info, call_chunk, start, length);
+}
+
+/**
+ * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message
+ * @info: context for RDMA Reads
+ *
+ * The start of the data lands in the first page just after the
+ * Transport header, and the rest lands in the page list of
* head->rc_arg.pages.
*
* Assumptions:
- * - A PZRC has an XDR-aligned length (no implicit round-up).
- * - There can be no trailing inline content (IOW, we assume
- * a PZRC is never sent in an RDMA_MSG message, though it's
- * allowed by spec).
+ * - A PZRC is never sent in an RDMA_MSG message, though it's
+ * allowed by spec.
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
*/
-static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ struct xdr_buf *buf = &head->rc_arg;
int ret;
- ret = svc_rdma_build_read_chunk(rqstp, info, p);
+ ret = svc_rdma_read_call_chunk(info);
if (ret < 0)
goto out;
- trace_svcrdma_send_pzr(info->ri_chunklen);
-
- head->rc_arg.len += info->ri_chunklen;
- head->rc_arg.buflen += info->ri_chunklen;
+ buf->len += info->ri_totalbytes;
+ buf->buflen += info->ri_totalbytes;
head->rc_hdr_count = 1;
- head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]);
- head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE,
- info->ri_chunklen);
-
- head->rc_arg.page_len = info->ri_chunklen -
- head->rc_arg.head[0].iov_len;
+ buf->head[0].iov_base = page_address(head->rc_pages[0]);
+ buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+ buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
out:
return ret;
@@ -824,26 +1088,34 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
}
/**
- * svc_rdma_recv_read_chunk - Pull a Read chunk from the client
+ * svc_rdma_process_read_list - Pull list of Read chunks from the client
* @rdma: controlling RDMA transport
* @rqstp: set of pages to use as Read sink buffers
* @head: pages under I/O collect here
- * @p: pointer to start of Read chunk
*
- * Returns:
- * %0 if all needed RDMA Reads were posted successfully,
- * %-EINVAL if client provided too many segments,
- * %-ENOMEM if rdma_rw context pool was exhausted,
- * %-ENOTCONN if posting failed (connection is lost),
- * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ * The RPC/RDMA protocol assumes that the upper layer's XDR decoders
+ * pull each Read chunk as they decode an incoming RPC message.
*
- * Assumptions:
- * - All Read segments in @p have the same Position value.
+ * On Linux, however, the server needs to have a fully-constructed RPC
+ * message in rqstp->rq_arg when there is a positive return code from
+ * ->xpo_recvfrom. So the Read list is safety-checked immediately when
+ * it is received, then here the whole Read list is pulled all at once.
+ * The ingress RPC message is fully reconstructed once all associated
+ * RDMA Reads have completed.
+ *
+ * Return values:
+ * %1: all needed RDMA Reads were posted successfully,
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
- struct svc_rdma_recv_ctxt *head, __be32 *p)
+int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *head)
{
struct svc_rdma_read_info *info;
+ struct svc_rdma_chunk_ctxt *cc;
int ret;
/* The request (with page list) is constructed in
@@ -861,23 +1133,29 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
info = svc_rdma_read_info_alloc(rdma);
if (!info)
return -ENOMEM;
+ cc = &info->ri_cc;
+ info->ri_rqst = rqstp;
info->ri_readctxt = head;
info->ri_pageno = 0;
info->ri_pageoff = 0;
-
- info->ri_position = be32_to_cpup(p + 1);
- if (info->ri_position)
- ret = svc_rdma_build_normal_read_chunk(rqstp, info, p);
- else
- ret = svc_rdma_build_pz_read_chunk(rqstp, info, p);
+ info->ri_totalbytes = 0;
+
+ if (pcl_is_empty(&head->rc_call_pcl)) {
+ if (head->rc_read_pcl.cl_count == 1)
+ ret = svc_rdma_read_data_item(info);
+ else
+ ret = svc_rdma_read_multiple_chunks(info);
+ } else
+ ret = svc_rdma_read_special(info);
if (ret < 0)
goto out_err;
- ret = svc_rdma_post_chunk_ctxt(&info->ri_cc);
+ trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
+ ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
- return 0;
+ return 1;
out_err:
svc_rdma_read_info_free(info);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 7b94d971feb3..68af79d4f04f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -358,49 +358,42 @@ static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt)
/**
* svc_rdma_encode_write_segment - Encode one Write segment
- * @src: matching Write chunk in the RPC Call header
* @sctxt: Send context for the RPC Reply
+ * @chunk: Write chunk to push
* @remaining: remaining bytes of the payload left in the Write chunk
+ * @segno: which segment in the chunk
*
* Return values:
* On success, returns length in bytes of the Reply XDR buffer
- * that was consumed by the Write segment
+ * that was consumed by the Write segment, and updates @remaining
* %-EMSGSIZE on XDR buffer overflow
*/
-static ssize_t svc_rdma_encode_write_segment(__be32 *src,
- struct svc_rdma_send_ctxt *sctxt,
- unsigned int *remaining)
+static ssize_t svc_rdma_encode_write_segment(struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_chunk *chunk,
+ u32 *remaining, unsigned int segno)
{
+ const struct svc_rdma_segment *segment = &chunk->ch_segments[segno];
+ const size_t len = rpcrdma_segment_maxsz * sizeof(__be32);
+ u32 length;
__be32 *p;
- const size_t len = rpcrdma_segment_maxsz * sizeof(*p);
- u32 handle, length;
- u64 offset;
p = xdr_reserve_space(&sctxt->sc_stream, len);
if (!p)
return -EMSGSIZE;
- xdr_decode_rdma_segment(src, &handle, &length, &offset);
-
- if (*remaining < length) {
- /* segment only partly filled */
- length = *remaining;
- *remaining = 0;
- } else {
- /* entire segment was consumed */
- *remaining -= length;
- }
- xdr_encode_rdma_segment(p, handle, length, offset);
-
- trace_svcrdma_encode_wseg(handle, length, offset);
+ length = min_t(u32, *remaining, segment->rs_length);
+ *remaining -= length;
+ xdr_encode_rdma_segment(p, segment->rs_handle, length,
+ segment->rs_offset);
+ trace_svcrdma_encode_wseg(sctxt, segno, segment->rs_handle, length,
+ segment->rs_offset);
return len;
}
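
The @remaining accounting above truncates the last segment so the encoded Write chunk reports only the bytes actually consumed by the payload. The core of that bookkeeping, as a stand-alone sketch:

/* Consume up to seg_length payload bytes from *remaining and return
 * the length to encode for this segment.
 */
static unsigned int sketch_consume_segment(unsigned int *remaining,
                                           unsigned int seg_length)
{
        unsigned int length = *remaining < seg_length ? *remaining : seg_length;

        *remaining -= length;
        return length;
}
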
/**
* svc_rdma_encode_write_chunk - Encode one Write chunk
- * @src: matching Write chunk in the RPC Call header
* @sctxt: Send context for the RPC Reply
- * @remaining: size in bytes of the payload in the Write chunk
+ * @chunk: Write chunk to push
*
* Copy a Write chunk from the Call transport header to the
* Reply transport header. Update each segment's length field
@@ -411,33 +404,28 @@ static ssize_t svc_rdma_encode_write_segment(__be32 *src,
* that was consumed by the Write chunk
* %-EMSGSIZE on XDR buffer overflow
*/
-static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
- struct svc_rdma_send_ctxt *sctxt,
- unsigned int remaining)
+static ssize_t svc_rdma_encode_write_chunk(struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_chunk *chunk)
{
- unsigned int i, nsegs;
+ u32 remaining = chunk->ch_payload_length;
+ unsigned int segno;
ssize_t len, ret;
len = 0;
- trace_svcrdma_encode_write_chunk(remaining);
-
- src++;
ret = xdr_stream_encode_item_present(&sctxt->sc_stream);
if (ret < 0)
- return -EMSGSIZE;
+ return ret;
len += ret;
- nsegs = be32_to_cpup(src++);
- ret = xdr_stream_encode_u32(&sctxt->sc_stream, nsegs);
+ ret = xdr_stream_encode_u32(&sctxt->sc_stream, chunk->ch_segcount);
if (ret < 0)
- return -EMSGSIZE;
+ return ret;
len += ret;
- for (i = nsegs; i; i--) {
- ret = svc_rdma_encode_write_segment(src, sctxt, &remaining);
+ for (segno = 0; segno < chunk->ch_segcount; segno++) {
+ ret = svc_rdma_encode_write_segment(sctxt, chunk, &remaining, segno);
if (ret < 0)
- return -EMSGSIZE;
- src += rpcrdma_segment_maxsz;
+ return ret;
len += ret;
}
@@ -448,32 +436,25 @@ static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
* svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list
* @rctxt: Reply context with information about the RPC Call
* @sctxt: Send context for the RPC Reply
- * @length: size in bytes of the payload in the first Write chunk
- *
- * The client provides a Write chunk list in the Call message. Fill
- * in the segments in the first Write chunk in the Reply's transport
- * header with the number of bytes consumed in each segment.
- * Remaining chunks are returned unused.
- *
- * Assumptions:
- * - Client has provided only one Write chunk
*
* Return values:
* On success, returns length in bytes of the Reply XDR buffer
* that was consumed by the Reply's Write list
* %-EMSGSIZE on XDR buffer overflow
*/
-static ssize_t
-svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
- struct svc_rdma_send_ctxt *sctxt,
- unsigned int length)
+static ssize_t svc_rdma_encode_write_list(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt)
{
+ struct svc_rdma_chunk *chunk;
ssize_t len, ret;
- ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length);
- if (ret < 0)
- return ret;
- len = ret;
+ len = 0;
+ pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
+ ret = svc_rdma_encode_write_chunk(sctxt, chunk);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ }
/* Terminate the Write list */
ret = xdr_stream_encode_item_absent(&sctxt->sc_stream);
@@ -489,56 +470,174 @@ svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
* @sctxt: Send context for the RPC Reply
* @length: size in bytes of the payload in the Reply chunk
*
- * Assumptions:
- * - Reply can always fit in the client-provided Reply chunk
- *
* Return values:
* On success, returns length in bytes of the Reply XDR buffer
* that was consumed by the Reply's Reply chunk
* %-EMSGSIZE on XDR buffer overflow
+ * %-E2BIG if the RPC message is larger than the Reply chunk
*/
static ssize_t
-svc_rdma_encode_reply_chunk(const struct svc_rdma_recv_ctxt *rctxt,
+svc_rdma_encode_reply_chunk(struct svc_rdma_recv_ctxt *rctxt,
struct svc_rdma_send_ctxt *sctxt,
unsigned int length)
{
- return svc_rdma_encode_write_chunk(rctxt->rc_reply_chunk, sctxt,
- length);
+ struct svc_rdma_chunk *chunk;
+
+ if (pcl_is_empty(&rctxt->rc_reply_pcl))
+ return xdr_stream_encode_item_absent(&sctxt->sc_stream);
+
+ chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
+ if (length > chunk->ch_length)
+ return -E2BIG;
+
+ chunk->ch_payload_length = length;
+ return svc_rdma_encode_write_chunk(sctxt, chunk);
}
-static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct page *page,
- unsigned long offset,
- unsigned int len)
+struct svc_rdma_map_data {
+ struct svcxprt_rdma *md_rdma;
+ struct svc_rdma_send_ctxt *md_ctxt;
+};
+
+/**
+ * svc_rdma_page_dma_map - DMA map one page
+ * @data: pointer to arguments
+ * @page: struct page to DMA map
+ * @offset: offset into the page
+ * @len: number of bytes to map
+ *
+ * Returns:
+ * %0 if DMA mapping was successful
+ * %-EIO if the page cannot be DMA mapped
+ */
+static int svc_rdma_page_dma_map(void *data, struct page *page,
+ unsigned long offset, unsigned int len)
{
+ struct svc_rdma_map_data *args = data;
+ struct svcxprt_rdma *rdma = args->md_rdma;
+ struct svc_rdma_send_ctxt *ctxt = args->md_ctxt;
struct ib_device *dev = rdma->sc_cm_id->device;
dma_addr_t dma_addr;
+ ++ctxt->sc_cur_sge_no;
+
dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
- trace_svcrdma_dma_map_page(rdma, dma_addr, len);
if (ib_dma_mapping_error(dev, dma_addr))
goto out_maperr;
+ trace_svcrdma_dma_map_page(rdma, dma_addr, len);
ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr;
ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len;
ctxt->sc_send_wr.num_sge++;
return 0;
out_maperr:
+ trace_svcrdma_dma_map_err(rdma, dma_addr, len);
return -EIO;
}
-/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
+/**
+ * svc_rdma_iov_dma_map - DMA map an iovec
+ * @data: pointer to arguments
+ * @iov: kvec to DMA map
+ *
+ * ib_dma_map_page() is used here because svc_rdma_dma_unmap()
* handles DMA-unmap and it uses ib_dma_unmap_page() exclusively.
+ *
+ * Returns:
+ * %0 if DMA mapping was successful
+ * %-EIO if the iovec cannot be DMA mapped
*/
-static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- unsigned char *base,
- unsigned int len)
+static int svc_rdma_iov_dma_map(void *data, const struct kvec *iov)
{
- return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base),
- offset_in_page(base), len);
+ if (!iov->iov_len)
+ return 0;
+ return svc_rdma_page_dma_map(data, virt_to_page(iov->iov_base),
+ offset_in_page(iov->iov_base),
+ iov->iov_len);
+}
+
+/**
+ * svc_rdma_xb_dma_map - DMA map all segments of an xdr_buf
+ * @xdr: xdr_buf containing portion of an RPC message to transmit
+ * @data: pointer to arguments
+ *
+ * Returns:
+ * %0 if DMA mapping was successful
+ * %-EIO if DMA mapping failed
+ *
+ * On failure, any DMA mappings that have been already done must be
+ * unmapped by the caller.
+ */
+static int svc_rdma_xb_dma_map(const struct xdr_buf *xdr, void *data)
+{
+ unsigned int len, remaining;
+ unsigned long pageoff;
+ struct page **ppages;
+ int ret;
+
+ ret = svc_rdma_iov_dma_map(data, &xdr->head[0]);
+ if (ret < 0)
+ return ret;
+
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ pageoff = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+
+ ret = svc_rdma_page_dma_map(data, *ppages++, pageoff, len);
+ if (ret < 0)
+ return ret;
+
+ remaining -= len;
+ pageoff = 0;
+ }
+
+ ret = svc_rdma_iov_dma_map(data, &xdr->tail[0]);
+ if (ret < 0)
+ return ret;
+
+ return xdr->len;
+}
+
+struct svc_rdma_pullup_data {
+ u8 *pd_dest;
+ unsigned int pd_length;
+ unsigned int pd_num_sges;
+};
+
+/**
+ * svc_rdma_xb_count_sges - Count how many SGEs will be needed
+ * @xdr: xdr_buf containing portion of an RPC message to transmit
+ * @data: pointer to arguments
+ *
+ * Returns:
+ * Number of SGEs needed to Send the contents of @xdr inline
+ */
+static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr,
+ void *data)
+{
+ struct svc_rdma_pullup_data *args = data;
+ unsigned int remaining;
+ unsigned long offset;
+
+ if (xdr->head[0].iov_len)
+ ++args->pd_num_sges;
+
+ offset = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ ++args->pd_num_sges;
+ remaining -= min_t(u32, PAGE_SIZE - offset, remaining);
+ offset = 0;
+ }
+
+ if (xdr->tail[0].iov_len)
+ ++args->pd_num_sges;
+
+ args->pd_length += xdr->len;
+ return 0;
}
/**
@@ -549,48 +648,71 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
* @xdr: xdr_buf containing RPC message to transmit
*
* Returns:
- * %true if pull-up must be used
- * %false otherwise
+ * %true if pull-up must be used
+ * %false otherwise
*/
-static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *sctxt,
+static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma,
+ const struct svc_rdma_send_ctxt *sctxt,
const struct svc_rdma_recv_ctxt *rctxt,
- struct xdr_buf *xdr)
+ const struct xdr_buf *xdr)
{
- int elements;
+ /* Resources needed for the transport header */
+ struct svc_rdma_pullup_data args = {
+ .pd_length = sctxt->sc_hdrbuf.len,
+ .pd_num_sges = 1,
+ };
+ int ret;
- /* For small messages, copying bytes is cheaper than DMA mapping.
- */
- if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH)
+ ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_count_sges, &args);
+ if (ret < 0)
+ return false;
+
+ if (args.pd_length < RPCRDMA_PULLUP_THRESH)
return true;
+ return args.pd_num_sges >= rdma->sc_max_send_sges;
+}
- /* Check whether the xdr_buf has more elements than can
- * fit in a single RDMA Send.
- */
- /* xdr->head */
- elements = 1;
-
- /* xdr->pages */
- if (!rctxt || !rctxt->rc_write_list) {
- unsigned int remaining;
- unsigned long pageoff;
-
- pageoff = xdr->page_base & ~PAGE_MASK;
- remaining = xdr->page_len;
- while (remaining) {
- ++elements;
- remaining -= min_t(u32, PAGE_SIZE - pageoff,
- remaining);
- pageoff = 0;
- }
+/**
+ * svc_rdma_xb_linearize - Copy region of xdr_buf to flat buffer
+ * @xdr: xdr_buf containing portion of an RPC message to copy
+ * @data: pointer to arguments
+ *
+ * Returns:
+ * Always zero.
+ */
+static int svc_rdma_xb_linearize(const struct xdr_buf *xdr,
+ void *data)
+{
+ struct svc_rdma_pullup_data *args = data;
+ unsigned int len, remaining;
+ unsigned long pageoff;
+ struct page **ppages;
+
+ if (xdr->head[0].iov_len) {
+ memcpy(args->pd_dest, xdr->head[0].iov_base, xdr->head[0].iov_len);
+ args->pd_dest += xdr->head[0].iov_len;
}
- /* xdr->tail */
- if (xdr->tail[0].iov_len)
- ++elements;
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ pageoff = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+ memcpy(args->pd_dest, page_address(*ppages) + pageoff, len);
+ remaining -= len;
+ args->pd_dest += len;
+ pageoff = 0;
+ ppages++;
+ }
- /* assume 1 SGE is needed for the transport header */
- return elements >= rdma->sc_max_send_sges;
+ if (xdr->tail[0].iov_len) {
+ memcpy(args->pd_dest, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+ args->pd_dest += xdr->tail[0].iov_len;
+ }
+
+ args->pd_length += xdr->len;
+ return 0;
}
/**
@@ -603,53 +725,30 @@ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
* The device is not capable of sending the reply directly.
* Assemble the elements of @xdr into the transport header buffer.
*
- * Returns zero on success, or a negative errno on failure.
+ * Assumptions:
+ * pull_up_needed has determined that @xdr will fit in the buffer.
+ *
+ * Returns:
+ * %0 if pull-up was successful
+ * %-EMSGSIZE if a buffer manipulation problem occurred
*/
-static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
+static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *sctxt,
const struct svc_rdma_recv_ctxt *rctxt,
const struct xdr_buf *xdr)
{
- unsigned char *dst, *tailbase;
- unsigned int taillen;
-
- dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len;
- memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
- dst += xdr->head[0].iov_len;
-
- tailbase = xdr->tail[0].iov_base;
- taillen = xdr->tail[0].iov_len;
- if (rctxt && rctxt->rc_write_list) {
- u32 xdrpad;
-
- xdrpad = xdr_pad_size(xdr->page_len);
- if (taillen && xdrpad) {
- tailbase += xdrpad;
- taillen -= xdrpad;
- }
- } else {
- unsigned int len, remaining;
- unsigned long pageoff;
- struct page **ppages;
-
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- pageoff = xdr->page_base & ~PAGE_MASK;
- remaining = xdr->page_len;
- while (remaining) {
- len = min_t(u32, PAGE_SIZE - pageoff, remaining);
-
- memcpy(dst, page_address(*ppages), len);
- remaining -= len;
- dst += len;
- pageoff = 0;
- }
- }
+ struct svc_rdma_pullup_data args = {
+ .pd_dest = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len,
+ };
+ int ret;
- if (taillen)
- memcpy(dst, tailbase, taillen);
+ ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_linearize, &args);
+ if (ret < 0)
+ return ret;
- sctxt->sc_sges[0].length += xdr->len;
- trace_svcrdma_send_pullup(sctxt->sc_sges[0].length);
+ sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len + args.pd_length;
+ trace_svcrdma_send_pullup(sctxt, args.pd_length);
return 0;
}
@@ -659,22 +758,22 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
* @rctxt: Write and Reply chunks provided by client
* @xdr: prepared xdr_buf containing RPC message
*
- * Load the xdr_buf into the ctxt's sge array, and DMA map each
- * element as it is added. The Send WR's num_sge field is set.
+ * Returns:
+ * %0 if DMA mapping was successful.
+ * %-EMSGSIZE if a buffer manipulation problem occurred
+ * %-EIO if DMA mapping failed
*
- * Returns zero on success, or a negative errno on failure.
+ * The Send WR's num_sge field is set in all cases.
*/
int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *sctxt,
const struct svc_rdma_recv_ctxt *rctxt,
- struct xdr_buf *xdr)
+ const struct xdr_buf *xdr)
{
- unsigned int len, remaining;
- unsigned long page_off;
- struct page **ppages;
- unsigned char *base;
- u32 xdr_pad;
- int ret;
+ struct svc_rdma_map_data args = {
+ .md_rdma = rdma,
+ .md_ctxt = sctxt,
+ };
/* Set up the (persistently-mapped) transport header SGE. */
sctxt->sc_send_wr.num_sge = 1;
@@ -683,7 +782,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
/* If there is a Reply chunk, nothing follows the transport
* header, and we're done here.
*/
- if (rctxt && rctxt->rc_reply_chunk)
+ if (!pcl_is_empty(&rctxt->rc_reply_pcl))
return 0;
/* For pull-up, svc_rdma_send() will sync the transport header.
@@ -692,58 +791,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
- ++sctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, sctxt,
- xdr->head[0].iov_base,
- xdr->head[0].iov_len);
- if (ret < 0)
- return ret;
-
- /* If a Write chunk is present, the xdr_buf's page list
- * is not included inline. However the Upper Layer may
- * have added XDR padding in the tail buffer, and that
- * should not be included inline.
- */
- if (rctxt && rctxt->rc_write_list) {
- base = xdr->tail[0].iov_base;
- len = xdr->tail[0].iov_len;
- xdr_pad = xdr_pad_size(xdr->page_len);
-
- if (len && xdr_pad) {
- base += xdr_pad;
- len -= xdr_pad;
- }
-
- goto tail;
- }
-
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- page_off = xdr->page_base & ~PAGE_MASK;
- remaining = xdr->page_len;
- while (remaining) {
- len = min_t(u32, PAGE_SIZE - page_off, remaining);
-
- ++sctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++,
- page_off, len);
- if (ret < 0)
- return ret;
-
- remaining -= len;
- page_off = 0;
- }
-
- base = xdr->tail[0].iov_base;
- len = xdr->tail[0].iov_len;
-tail:
- if (len) {
- ++sctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len);
- if (ret < 0)
- return ret;
- }
-
- return 0;
+ return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_dma_map, &args);
}
/* The svc_rqst and all resources it owns are released as soon as
@@ -893,9 +942,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
__be32 *rdma_argp = rctxt->rc_recv_buf;
- __be32 *wr_lst = rctxt->rc_write_list;
- __be32 *rp_ch = rctxt->rc_reply_chunk;
- struct xdr_buf *xdr = &rqstp->rq_res;
struct svc_rdma_send_ctxt *sctxt;
__be32 *p;
int ret;
@@ -913,45 +959,22 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
rpcrdma_fixed_maxsz * sizeof(*p));
if (!p)
goto err0;
+
+ ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
+ if (ret < 0)
+ goto err2;
+
*p++ = *rdma_argp;
*p++ = *(rdma_argp + 1);
*p++ = rdma->sc_fc_credits;
- *p = rp_ch ? rdma_nomsg : rdma_msg;
+ *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg;
if (svc_rdma_encode_read_list(sctxt) < 0)
goto err0;
- if (wr_lst) {
- /* XXX: Presume the client sent only one Write chunk */
- unsigned long offset;
- unsigned int length;
-
- if (rctxt->rc_read_payload_length) {
- offset = rctxt->rc_read_payload_offset;
- length = rctxt->rc_read_payload_length;
- } else {
- offset = xdr->head[0].iov_len;
- length = xdr->page_len;
- }
- ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, offset,
- length);
- if (ret < 0)
- goto err2;
- if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0)
- goto err0;
- } else {
- if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
- goto err0;
- }
- if (rp_ch) {
- ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
- if (ret < 0)
- goto err2;
- if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
- goto err0;
- } else {
- if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
- goto err0;
- }
+ if (svc_rdma_encode_write_list(rctxt, sctxt) < 0)
+ goto err0;
+ if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
+ goto err0;
ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
if (ret < 0)
@@ -978,28 +1001,46 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
}
/**
- * svc_rdma_read_payload - special processing for a READ payload
+ * svc_rdma_result_payload - special processing for a result payload
* @rqstp: svc_rqst to operate on
* @offset: payload's byte offset in @xdr
* @length: size of payload, in bytes
*
- * Returns zero on success.
- *
- * For the moment, just record the xdr_buf location of the READ
- * payload. svc_rdma_sendto will use that location later when
- * we actually send the payload.
+ * Return values:
+ * %0 if successful or nothing needed to be done
+ * %-EMSGSIZE on XDR buffer overflow
+ * %-E2BIG if the payload was larger than the Write chunk
+ * %-EINVAL if client provided too many segments
+ * %-ENOMEM if rdma_rw context pool was exhausted
+ * %-ENOTCONN if posting failed (connection is lost)
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc)
*/
-int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset,
- unsigned int length)
+int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
{
struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
+ struct svc_rdma_chunk *chunk;
+ struct svcxprt_rdma *rdma;
+ struct xdr_buf subbuf;
+ int ret;
- /* XXX: Just one READ payload slot for now, since our
- * transport implementation currently supports only one
- * Write chunk.
- */
- rctxt->rc_read_payload_offset = offset;
- rctxt->rc_read_payload_length = length;
+ chunk = rctxt->rc_cur_result_payload;
+ if (!length || !chunk)
+ return 0;
+ rctxt->rc_cur_result_payload =
+ pcl_next_chunk(&rctxt->rc_write_pcl, chunk);
+ if (length > chunk->ch_length)
+ return -E2BIG;
+ chunk->ch_position = offset;
+ chunk->ch_payload_length = length;
+
+ if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length))
+ return -EMSGSIZE;
+
+ rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
+ ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf);
+ if (ret < 0)
+ return ret;
return 0;
}
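
The svc_rdma_xb_* helpers introduced above all walk an xdr_buf the same way: the head kvec, then the page array starting at page_base, then the tail kvec. A small userspace sketch of just the page-array split, assuming a 4096-byte page size (struct page handling and DMA mapping are omitted):

#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned int count_page_segments(unsigned int page_base,
					unsigned int page_len)
{
	unsigned int pageoff = page_base % PAGE_SIZE;
	unsigned int remaining = page_len;
	unsigned int nsegs = 0;

	while (remaining) {
		unsigned int len = PAGE_SIZE - pageoff;

		if (len > remaining)
			len = remaining;
		printf("segment %u: %u bytes\n", nsegs, len);
		remaining -= len;
		pageoff = 0;	/* pages after the first start at offset 0 */
		nsegs++;
	}
	return nsegs;
}

int main(void)
{
	/* 10000 bytes starting 100 bytes into the first page -> 3 segments */
	printf("total segments: %u\n", count_page_segments(100, 10000));
	return 0;
}
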
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fb044792b571..afba4e9d5425 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -80,7 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = {
.xpo_create = svc_rdma_create,
.xpo_recvfrom = svc_rdma_recvfrom,
.xpo_sendto = svc_rdma_sendto,
- .xpo_read_payload = svc_rdma_read_payload,
+ .xpo_result_payload = svc_rdma_result_payload,
.xpo_release_rqst = svc_rdma_release_rqst,
.xpo_detach = svc_rdma_detach,
.xpo_free = svc_rdma_free,
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 053c8ab1265a..78d29d1bcc20 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -413,9 +413,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
snprintf(buf, sizeof(buf), "%4hx", port);
xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
-
- trace_xprtrdma_op_setport(container_of(xprt, struct rpcrdma_xprt,
- rx_xprt));
}
/**
@@ -586,11 +583,9 @@ xprt_rdma_allocate(struct rpc_task *task)
rqst->rq_buffer = rdmab_data(req->rl_sendbuf);
rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf);
- trace_xprtrdma_op_allocate(task, req);
return 0;
out_fail:
- trace_xprtrdma_op_allocate(task, NULL);
return -ENOMEM;
}
@@ -604,13 +599,12 @@ static void
xprt_rdma_free(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- trace_xprtrdma_op_free(task, req);
-
- if (!list_empty(&req->rl_registered))
- frwr_unmap_sync(r_xprt, req);
+ if (unlikely(!list_empty(&req->rl_registered))) {
+ trace_xprtrdma_mrs_zap(task);
+ frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req);
+ }
/* XXX: If the RPC is completing because of a signal and
* not because a reply was received, we ought to ensure
@@ -775,6 +769,7 @@ static struct xprt_class xprt_rdma = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_RDMA,
.setup = xprt_setup_rdma,
+ .netid = { "rdma", "rdma6", "" },
};
void xprt_rdma_cleanup(void)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index ad6e2e4994ce..ec912cf9c618 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -167,7 +167,7 @@ static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_send(sc, wc);
+ trace_xprtrdma_wc_send(wc, &sc->sc_cid);
rpcrdma_sendctx_put_locked(r_xprt, sc);
rpcrdma_flush_disconnect(r_xprt, wc);
}
@@ -186,7 +186,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_receive(wc);
+ trace_xprtrdma_wc_receive(wc, &rep->rr_cid);
--r_xprt->rx_ep->re_receive_count;
if (wc->status != IB_WC_SUCCESS)
goto out_flushed;
@@ -643,6 +643,9 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
return NULL;
sc->sc_cqe.done = rpcrdma_wc_send;
+ sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id;
+ sc->sc_cid.ci_completion_id =
+ atomic_inc_return(&ep->re_completion_ids);
return sc;
}
@@ -972,6 +975,9 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
goto out_free_regbuf;
+ rep->rr_cid.ci_completion_id =
+ atomic_inc_return(&r_xprt->rx_ep->re_completion_ids);
+
xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf));
rep->rr_cqe.done = rpcrdma_wc_receive;
@@ -1179,25 +1185,6 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
}
/**
- * rpcrdma_mr_put - DMA unmap an MR and release it
- * @mr: MR to release
- *
- */
-void rpcrdma_mr_put(struct rpcrdma_mr *mr)
-{
- struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-
- if (mr->mr_dir != DMA_NONE) {
- trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
- mr->mr_sg, mr->mr_nents, mr->mr_dir);
- mr->mr_dir = DMA_NONE;
- }
-
- rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
-}
-
-/**
* rpcrdma_buffer_get - Get a request buffer
* @buffers: Buffer pool from which to obtain a buffer
*
@@ -1411,6 +1398,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
if (!rep)
break;
+ rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id;
trace_xprtrdma_post_recv(rep);
rep->rr_recv_wr.next = wr;
wr = &rep->rr_recv_wr;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 43974ef39a50..94b28657aeeb 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -53,6 +53,7 @@
#include <rdma/ib_verbs.h> /* RDMA verbs api */
#include <linux/sunrpc/clnt.h> /* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */
#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
@@ -93,6 +94,8 @@ struct rpcrdma_ep {
unsigned int re_max_requests; /* depends on device */
unsigned int re_inline_send; /* negotiated */
unsigned int re_inline_recv; /* negotiated */
+
+ atomic_t re_completion_ids;
};
/* Pre-allocate extra Work Requests for handling backward receives
@@ -180,6 +183,8 @@ enum {
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
+ struct rpc_rdma_cid rr_cid;
+
__be32 rr_xid;
__be32 rr_vers;
__be32 rr_proc;
@@ -211,6 +216,7 @@ enum {
struct rpcrdma_req;
struct rpcrdma_sendctx {
struct ib_cqe sc_cqe;
+ struct rpc_rdma_cid sc_cid;
struct rpcrdma_req *sc_req;
unsigned int sc_unmap_count;
struct ib_sge sc_sges[];
@@ -225,6 +231,7 @@ struct rpcrdma_sendctx {
struct rpcrdma_frwr {
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
+ struct rpc_rdma_cid fr_cid;
struct completion fr_linv_done;
union {
struct ib_reg_wr fr_regwr;
@@ -236,6 +243,7 @@ struct rpcrdma_req;
struct rpcrdma_mr {
struct list_head mr_list;
struct rpcrdma_req *mr_req;
+ struct ib_device *mr_device;
struct scatterlist *mr_sg;
int mr_nents;
enum dma_data_direction mr_dir;
@@ -466,7 +474,6 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_mr_put(struct rpcrdma_mr *mr);
void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 554e1bb4c1c7..c56a66cdf4ac 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -433,7 +433,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
if (ret <= 0)
goto sock_err;
xs_flush_bvec(buf->bvec, ret, seek + buf->page_base);
- offset += ret - buf->page_base;
+ ret -= buf->page_base;
+ offset += ret;
if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
goto out;
if (ret != want)
@@ -762,10 +763,7 @@ static int xs_nospace(struct rpc_rqst *req)
struct sock *sk = transport->inet;
int ret = -EAGAIN;
- dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
- req->rq_task->tk_pid,
- req->rq_slen - transport->xmit.offset,
- req->rq_slen);
+ trace_rpc_socket_nospace(req, transport);
/* Protect against races with write_space */
spin_lock(&xprt->transport_lock);
@@ -3062,6 +3060,7 @@ static struct xprt_class xs_local_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_LOCAL,
.setup = xs_setup_local,
+ .netid = { "" },
};
static struct xprt_class xs_udp_transport = {
@@ -3070,6 +3069,7 @@ static struct xprt_class xs_udp_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_UDP,
.setup = xs_setup_udp,
+ .netid = { "udp", "udp6", "" },
};
static struct xprt_class xs_tcp_transport = {
@@ -3078,6 +3078,7 @@ static struct xprt_class xs_tcp_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_TCP,
.setup = xs_setup_tcp,
+ .netid = { "tcp", "tcp6", "" },
};
static struct xprt_class xs_bc_tcp_transport = {
@@ -3086,6 +3087,7 @@ static struct xprt_class xs_bc_tcp_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_BC_TCP,
.setup = xs_setup_bc_tcp,
+ .netid = { "" },
};
/**
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 23d868545362..2c1ffc9ba2eb 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -460,10 +460,11 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev,
extack = switchdev_notifier_info_to_extack(&port_obj_info->info);
if (check_cb(dev)) {
- /* This flag is only checked if the return value is success. */
- port_obj_info->handled = true;
- return add_cb(dev, port_obj_info->obj, port_obj_info->trans,
- extack);
+ err = add_cb(dev, port_obj_info->obj, port_obj_info->trans,
+ extack);
+ if (err != -EOPNOTSUPP)
+ port_obj_info->handled = true;
+ return err;
}
/* Switch ports might be stacked under e.g. a LAG. Ignore the
@@ -515,9 +516,10 @@ static int __switchdev_handle_port_obj_del(struct net_device *dev,
int err = -EOPNOTSUPP;
if (check_cb(dev)) {
- /* This flag is only checked if the return value is success. */
- port_obj_info->handled = true;
- return del_cb(dev, port_obj_info->obj);
+ err = del_cb(dev, port_obj_info->obj);
+ if (err != -EOPNOTSUPP)
+ port_obj_info->handled = true;
+ return err;
}
/* Switch ports might be stacked under e.g. a LAG. Ignore the
@@ -568,9 +570,10 @@ static int __switchdev_handle_port_attr_set(struct net_device *dev,
int err = -EOPNOTSUPP;
if (check_cb(dev)) {
- port_attr_info->handled = true;
- return set_cb(dev, port_attr_info->attr,
- port_attr_info->trans);
+ err = set_cb(dev, port_attr_info->attr, port_attr_info->trans);
+ if (err != -EOPNOTSUPP)
+ port_attr_info->handled = true;
+ return err;
}
/* Switch ports might be stacked under e.g. a LAG. Ignore the
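
The three switchdev hunks above apply one pattern: any callback result other than -EOPNOTSUPP now marks the event as handled, so a real driver error propagates to the caller instead of looking like an unhandled notification. A userspace sketch of the pattern (the structure and callbacks below are illustrative, not the kernel API):

#include <stdio.h>
#include <errno.h>

struct obj_info {
	int handled;
};

static int try_callback(struct obj_info *info, int (*cb)(void))
{
	int err = cb();

	if (err != -EOPNOTSUPP)
		info->handled = 1;	/* success and hard errors both count */
	return err;
}

static int cb_unsupported(void) { return -EOPNOTSUPP; }
static int cb_fails(void) { return -ENOMEM; }

int main(void)
{
	struct obj_info info = { 0 };

	try_callback(&info, cb_unsupported);
	printf("after -EOPNOTSUPP: handled=%d\n", info.handled);	/* 0 */
	try_callback(&info, cb_fails);
	printf("after -ENOMEM:     handled=%d\n", info.handled);	/* 1 */
	return 0;
}
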
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index 0f1eaed1bd1b..abe29d1aa23a 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -55,12 +55,11 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr)
void tipc_set_node_id(struct net *net, u8 *id)
{
struct tipc_net *tn = tipc_net(net);
- u32 *tmp = (u32 *)id;
memcpy(tn->node_id, id, NODE_ID_LEN);
tipc_nodeid2string(tn->node_id_string, id);
- tn->trial_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3];
- pr_info("Own node identity %s, cluster identity %u\n",
+ tn->trial_addr = hash128to32(id);
+ pr_info("Node identity %s, cluster identity %u\n",
tipc_own_id_string(net), tn->net_id);
}
@@ -76,7 +75,7 @@ void tipc_set_node_addr(struct net *net, u32 addr)
}
tn->trial_addr = addr;
tn->addr_trial_end = jiffies;
- pr_info("32-bit node address hash set to %x\n", addr);
+ pr_info("Node number set to %u\n", addr);
}
char *tipc_nodeid2string(char *str, u8 *id)
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 31bee0ea7b3e..1a11831bef62 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -3,6 +3,7 @@
*
* Copyright (c) 2000-2006, 2018, Ericsson AB
* Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2020, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 940d176e0e87..d4beca895992 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -108,6 +108,8 @@ static void tipc_bcbase_select_primary(struct net *net)
{
struct tipc_bc_base *bb = tipc_bc_base(net);
int all_dests = tipc_link_bc_peers(bb->link);
+ int max_win = tipc_link_max_win(bb->link);
+ int min_win = tipc_link_min_win(bb->link);
int i, mtu, prim;
bb->primary_bearer = INVALID_BEARER_ID;
@@ -121,8 +123,12 @@ static void tipc_bcbase_select_primary(struct net *net)
continue;
mtu = tipc_bearer_mtu(net, i);
- if (mtu < tipc_link_mtu(bb->link))
+ if (mtu < tipc_link_mtu(bb->link)) {
tipc_link_set_mtu(bb->link, mtu);
+ tipc_link_set_queue_limits(bb->link,
+ min_win,
+ max_win);
+ }
bb->bcast_support &= tipc_bearer_bcast_support(net, i);
if (bb->dests[i] < all_dests)
continue;
@@ -585,7 +591,7 @@ static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win)
if (max_win > TIPC_MAX_LINK_WIN)
return -EINVAL;
tipc_bcast_lock(net);
- tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win);
+ tipc_link_set_queue_limits(l, tipc_link_min_win(l), max_win);
tipc_bcast_unlock(net);
return 0;
}
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 650414110452..a4389ef08a98 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -72,6 +72,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
/**
* tipc_media_find - locates specified media object by name
+ * @name: name to locate
*/
struct tipc_media *tipc_media_find(const char *name)
{
@@ -86,6 +87,7 @@ struct tipc_media *tipc_media_find(const char *name)
/**
* media_find_id - locates specified media object by type identifier
+ * @type: type identifier to locate
*/
static struct tipc_media *media_find_id(u8 type)
{
@@ -100,6 +102,9 @@ static struct tipc_media *media_find_id(u8 type)
/**
* tipc_media_addr_printf - record media address in print buffer
+ * @buf: output buffer
+ * @len: output buffer size remaining
+ * @a: input media address
*/
int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a)
{
@@ -127,7 +132,7 @@ int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a)
* @name: ptr to bearer name string
* @name_parts: ptr to area for bearer name components (or NULL if not needed)
*
- * Returns 1 if bearer name is valid, otherwise 0.
+ * Return: 1 if bearer name is valid, otherwise 0.
*/
static int bearer_name_validate(const char *name,
struct tipc_bearer_names *name_parts)
@@ -139,10 +144,7 @@ static int bearer_name_validate(const char *name,
u32 if_len;
/* copy bearer name & ensure length is OK */
- name_copy[TIPC_MAX_BEARER_NAME - 1] = 0;
- /* need above in case non-Posix strncpy() doesn't pad with nulls */
- strncpy(name_copy, name, TIPC_MAX_BEARER_NAME);
- if (name_copy[TIPC_MAX_BEARER_NAME - 1] != 0)
+ if (strscpy(name_copy, name, TIPC_MAX_BEARER_NAME) < 0)
return 0;
/* ensure all component parts of bearer name are present */
@@ -169,6 +171,8 @@ static int bearer_name_validate(const char *name,
/**
* tipc_bearer_find - locates bearer object with matching bearer name
+ * @net: the applicable net namespace
+ * @name: bearer name to locate
*/
struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name)
{
@@ -231,6 +235,11 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest)
/**
* tipc_enable_bearer - enable bearer with the given name
+ * @net: the applicable net namespace
+ * @name: bearer name to enable
+ * @disc_domain: bearer domain
+ * @prio: bearer priority
+ * @attr: nlattr array
*/
static int tipc_enable_bearer(struct net *net, const char *name,
u32 disc_domain, u32 prio,
@@ -345,6 +354,8 @@ rejected:
/**
* tipc_reset_bearer - Reset all links established over this bearer
+ * @net: the applicable net namespace
+ * @b: the target bearer
*/
static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b)
{
@@ -366,7 +377,9 @@ void tipc_bearer_put(struct tipc_bearer *b)
}
/**
- * bearer_disable
+ * bearer_disable - disable this bearer
+ * @net: the applicable net namespace
+ * @b: the bearer to disable
*
* Note: This routine assumes caller holds RTNL lock.
*/
@@ -437,6 +450,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
}
/* tipc_disable_l2_media - detach TIPC bearer from an L2 interface
+ * @b: the target bearer
*
* Mark L2 bearer as inactive so that incoming buffers are thrown away
*/
@@ -453,6 +467,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b)
/**
* tipc_l2_send_msg - send a TIPC packet out over an L2 interface
+ * @net: the associated network namespace
* @skb: the packet to be sent
* @b: the bearer through which the packet is to be sent
* @dest: peer destination address
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index bc0023119da2..6bf4550aa1ac 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -93,7 +93,8 @@ struct tipc_bearer;
* @raw2addr: convert from raw addr format to media addr format
* @priority: default link (and bearer) priority
* @tolerance: default time (in ms) before declaring link failure
- * @window: default window (in packets) before declaring link congestion
+ * @min_win: minimum window (in packets) before declaring link congestion
+ * @max_win: maximum window (in packets) before declaring link congestion
* @mtu: max packet size bearer can support for media type not dependent on
* underlying device MTU
* @type_id: TIPC media identifier
@@ -138,12 +139,15 @@ struct tipc_media {
* @pt: packet type for bearer
* @rcu: rcu struct for tipc_bearer
* @priority: default link priority for bearer
- * @window: default window size for bearer
+ * @min_win: minimum window (in packets) before declaring link congestion
+ * @max_win: maximum window (in packets) before declaring link congestion
* @tolerance: default link tolerance for bearer
* @domain: network domain to which links can be established
* @identity: array index of this bearer within TIPC bearer array
- * @link_req: ptr to (optional) structure making periodic link setup requests
+ * @disc: ptr to link setup request
* @net_plane: network plane ('A' through 'H') currently associated with bearer
+ * @up: bearer up flag (bit 0)
+ * @refcnt: tipc_bearer reference counter
*
* Note: media-specific code is responsible for initialization of the fields
* indicated below when a bearer is enabled; TIPC's generic bearer code takes
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 4f6dc74adf45..5cc1f0307215 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -60,6 +60,7 @@ static int __net_init tipc_init_net(struct net *net)
tn->trial_addr = 0;
tn->addr_trial_end = 0;
tn->capabilities = TIPC_NODE_CAPABILITIES;
+ INIT_WORK(&tn->final_work.work, tipc_net_finalize_work);
memset(tn->node_id, 0, sizeof(tn->node_id));
memset(tn->node_id_string, 0, sizeof(tn->node_id_string));
tn->mon_threshold = TIPC_DEF_MON_THRESHOLD;
@@ -80,8 +81,6 @@ static int __net_init tipc_init_net(struct net *net)
if (err)
goto out_nametbl;
- INIT_LIST_HEAD(&tn->dist_queue);
-
err = tipc_bcast_init(net);
if (err)
goto out_bclink;
@@ -107,8 +106,13 @@ out_crypto:
static void __net_exit tipc_exit_net(struct net *net)
{
+ struct tipc_net *tn = tipc_net(net);
+
tipc_detach_loopback(net);
+ /* Make sure the tipc_net_finalize_work() finished */
+ cancel_work_sync(&tn->final_work.work);
tipc_net_stop(net);
+
tipc_bcast_stop(net);
tipc_nametbl_stop(net);
tipc_sk_rht_destroy(net);
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 631d83c9705f..03de7b213f55 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -3,6 +3,7 @@
*
* Copyright (c) 2005-2006, 2013-2018 Ericsson AB
* Copyright (c) 2005-2007, 2010-2013, Wind River Systems
+ * Copyright (c) 2020, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -90,6 +91,12 @@ extern unsigned int tipc_net_id __read_mostly;
extern int sysctl_tipc_rmem[3] __read_mostly;
extern int sysctl_tipc_named_timeout __read_mostly;
+struct tipc_net_work {
+ struct work_struct work;
+ struct net *net;
+ u32 addr;
+};
+
struct tipc_net {
u8 node_id[NODE_ID_LEN];
u32 node_addr;
@@ -126,9 +133,6 @@ struct tipc_net {
spinlock_t nametbl_lock;
struct name_table *nametbl;
- /* Name dist queue */
- struct list_head dist_queue;
-
/* Topology subscription server */
struct tipc_topsrv *topsrv;
atomic_t subscription_count;
@@ -143,6 +147,8 @@ struct tipc_net {
/* TX crypto handler */
struct tipc_crypto *crypto_tx;
#endif
+ /* Work item for net finalize */
+ struct tipc_net_work final_work;
};
static inline struct tipc_net *tipc_net(struct net *net)
@@ -205,6 +211,17 @@ static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand)
return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand;
}
+static inline u32 hash128to32(char *bytes)
+{
+ __be32 *tmp = (__be32 *)bytes;
+ u32 res;
+
+ res = ntohl(tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]);
+ if (likely(res))
+ return res;
+ return ntohl(tmp[0] | tmp[1] | tmp[2] | tmp[3]);
+}
+
#ifdef CONFIG_SYSCTL
int tipc_register_sysctl(void);
void tipc_unregister_sysctl(void);
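
The hash128to32() helper above folds the 128-bit node identity into the 32-bit trial address by XOR-ing its four words, falling back to OR so that a non-zero identity never maps to address 0. A userspace sketch with an arbitrary sample identity:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

static uint32_t hash128to32(const char *bytes)
{
	uint32_t w[4], res;

	memcpy(w, bytes, sizeof(w));
	res = ntohl(w[0] ^ w[1] ^ w[2] ^ w[3]);
	if (res)
		return res;
	return ntohl(w[0] | w[1] | w[2] | w[3]);	/* fallback, avoids 0 */
}

int main(void)
{
	char id[16] = "node-1234567890";	/* sample 128-bit identity */

	printf("trial address: 0x%08x\n", hash128to32(id));
	return 0;
}
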
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index 7c523dc81575..f4fca8f7f63f 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/**
+/*
* net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption
*
* Copyright (c) 2019, Ericsson AB
@@ -36,28 +36,34 @@
#include <crypto/aead.h>
#include <crypto/aes.h>
+#include <crypto/rng.h>
#include "crypto.h"
+#include "msg.h"
+#include "bcast.h"
-#define TIPC_TX_PROBE_LIM msecs_to_jiffies(1000) /* > 1s */
-#define TIPC_TX_LASTING_LIM msecs_to_jiffies(120000) /* 2 mins */
+#define TIPC_TX_GRACE_PERIOD msecs_to_jiffies(5000) /* 5s */
+#define TIPC_TX_LASTING_TIME msecs_to_jiffies(10000) /* 10s */
#define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */
-#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(180000) /* 3 mins */
+#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(15000) /* 15s */
+
#define TIPC_MAX_TFMS_DEF 10
#define TIPC_MAX_TFMS_LIM 1000
-/**
+#define TIPC_REKEYING_INTV_DEF (60 * 24) /* default: 1 day */
+
+/*
* TIPC Key ids
*/
enum {
- KEY_UNUSED = 0,
- KEY_MIN,
- KEY_1 = KEY_MIN,
+ KEY_MASTER = 0,
+ KEY_MIN = KEY_MASTER,
+ KEY_1 = 1,
KEY_2,
KEY_3,
KEY_MAX = KEY_3,
};
-/**
+/*
* TIPC Crypto statistics
*/
enum {
@@ -81,8 +87,10 @@ static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok",
/* Max TFMs number per key */
int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF;
+/* Key exchange switch, default: on */
+int sysctl_tipc_key_exchange_enabled __read_mostly = 1;
-/**
+/*
* struct tipc_key - TIPC keys' status indicator
*
* 7 6 5 4 3 2 1 0
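
The byte whose bit layout the comment above starts to describe packs three 2-bit key ids (passive, active, pending); tipc_crypto_key_set_state() later in this patch rebuilds it from those fields. A userspace sketch of the packing, assuming KEY_BITS is 2 as the three key slots 1..3 suggest:

#include <stdio.h>
#include <stdint.h>

#define KEY_BITS 2
#define KEY_MASK ((1 << KEY_BITS) - 1)

static uint8_t pack_keys(uint8_t passive, uint8_t active, uint8_t pending)
{
	return ((passive & KEY_MASK) << (KEY_BITS * 2)) |
	       ((active & KEY_MASK) << KEY_BITS) |
	       (pending & KEY_MASK);
}

int main(void)
{
	uint8_t keys = pack_keys(3, 1, 2);

	printf("keys=0x%02x passive=%u active=%u pending=%u\n", keys,
	       (keys >> (KEY_BITS * 2)) & KEY_MASK,
	       (keys >> KEY_BITS) & KEY_MASK,
	       keys & KEY_MASK);
	return 0;
}
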
@@ -115,6 +123,8 @@ struct tipc_key {
/**
* struct tipc_tfm - TIPC TFM structure to form a list of TFMs
+ * @tfm: cipher handle/key
+ * @list: linked list of TFMs
*/
struct tipc_tfm {
struct crypto_aead *tfm;
@@ -130,8 +140,10 @@ struct tipc_tfm {
* @salt: the key's SALT value
* @authsize: authentication tag size (max = 16)
* @mode: crypto mode is applied to the key
- * @hint[]: a hint for user key
+ * @hint: a hint for user key
* @rcu: struct rcu_head
+ * @key: the aead key
+ * @gen: the key's generation
* @seqno: the key seqno (cluster scope)
* @refcnt: the key reference counter
*/
@@ -144,8 +156,10 @@ struct tipc_aead {
u32 salt;
u8 authsize;
u8 mode;
- char hint[TIPC_AEAD_HINT_LEN + 1];
+ char hint[2 * TIPC_AEAD_HINT_LEN + 1];
struct rcu_head rcu;
+ struct tipc_aead_key *key;
+ u16 gen;
atomic64_t seqno ____cacheline_aligned;
refcount_t refcnt ____cacheline_aligned;
@@ -154,6 +168,7 @@ struct tipc_aead {
/**
* struct tipc_crypto_stats - TIPC Crypto statistics
+ * @stat: array of crypto statistics
*/
struct tipc_crypto_stats {
unsigned int stat[MAX_STATS];
@@ -165,26 +180,57 @@ struct tipc_crypto_stats {
* @node: TIPC node (RX)
* @aead: array of pointers to AEAD keys for encryption/decryption
* @peer_rx_active: replicated peer RX active key index
+ * @key_gen: TX/RX key generation
* @key: the key states
- * @working: the crypto is working or not
+ * @skey_mode: session key's mode
+ * @skey: received session key
+ * @wq: common workqueue on TX crypto
+ * @work: delayed work sched for TX/RX
+ * @key_distr: key distributing state
+ * @rekeying_intv: rekeying interval (in minutes)
* @stats: the crypto statistics
+ * @name: the crypto name
* @sndnxt: the per-peer sndnxt (TX)
* @timer1: general timer 1 (jiffies)
- * @timer2: general timer 1 (jiffies)
+ * @timer2: general timer 2 (jiffies)
+ * @working: the crypto is working or not
+ * @key_master: flag indicates if master key exists
+ * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.)
+ * @nokey: no key indication
+ * @flags: combined flags field
* @lock: tipc_key lock
*/
struct tipc_crypto {
struct net *net;
struct tipc_node *node;
- struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */
+ struct tipc_aead __rcu *aead[KEY_MAX + 1];
atomic_t peer_rx_active;
+ u16 key_gen;
struct tipc_key key;
- u8 working:1;
+ u8 skey_mode;
+ struct tipc_aead_key *skey;
+ struct workqueue_struct *wq;
+ struct delayed_work work;
+#define KEY_DISTR_SCHED 1
+#define KEY_DISTR_COMPL 2
+ atomic_t key_distr;
+ u32 rekeying_intv;
+
struct tipc_crypto_stats __percpu *stats;
+ char name[48];
atomic64_t sndnxt ____cacheline_aligned;
unsigned long timer1;
unsigned long timer2;
+ union {
+ struct {
+ u8 working:1;
+ u8 key_master:1;
+ u8 legacy_user:1;
+ u8 nokey: 1;
+ };
+ u8 flags;
+ };
spinlock_t lock; /* crypto lock */
} ____cacheline_aligned;
@@ -234,23 +280,35 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
u8 new_active,
u8 new_pending);
static int tipc_crypto_key_attach(struct tipc_crypto *c,
- struct tipc_aead *aead, u8 pos);
+ struct tipc_aead *aead, u8 pos,
+ bool master_key);
static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending);
static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
struct tipc_crypto *rx,
- struct sk_buff *skb);
-static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active,
- struct tipc_msg *hdr);
+ struct sk_buff *skb,
+ u8 tx_key);
+static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb);
static int tipc_crypto_key_revoke(struct net *net, u8 tx_key);
+static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb,
+ struct tipc_bearer *b,
+ struct tipc_media_addr *dst,
+ struct tipc_node *__dnode, u8 type);
static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
struct tipc_bearer *b,
struct sk_buff **skb, int err);
static void tipc_crypto_do_cmd(struct net *net, int cmd);
static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf);
-#ifdef TIPC_CRYPTO_DEBUG
static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
char *buf);
-#endif
+static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey,
+ u16 gen, u8 mode, u32 dnode);
+static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr);
+static void tipc_crypto_work_tx(struct work_struct *work);
+static void tipc_crypto_work_rx(struct work_struct *work);
+static int tipc_aead_key_generate(struct tipc_aead_key *skey);
+
+#define is_tx(crypto) (!(crypto)->node)
+#define is_rx(crypto) (!is_tx(crypto))
#define key_next(cur) ((cur) % KEY_MAX + 1)
@@ -270,31 +328,58 @@ do { \
/**
* tipc_aead_key_validate - Validate a AEAD user key
+ * @ukey: pointer to user key data
+ * @info: netlink info pointer
*/
-int tipc_aead_key_validate(struct tipc_aead_key *ukey)
+int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info)
{
int keylen;
/* Check if algorithm exists */
if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) {
- pr_info("Not found cipher: \"%s\"!\n", ukey->alg_name);
+ GENL_SET_ERR_MSG(info, "unable to load the algorithm (module existed?)");
return -ENODEV;
}
/* Currently, we only support the "gcm(aes)" cipher algorithm */
- if (strcmp(ukey->alg_name, "gcm(aes)"))
+ if (strcmp(ukey->alg_name, "gcm(aes)")) {
+ GENL_SET_ERR_MSG(info, "not supported yet the algorithm");
return -ENOTSUPP;
+ }
/* Check if key size is correct */
keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE;
if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 &&
keylen != TIPC_AES_GCM_KEY_SIZE_192 &&
- keylen != TIPC_AES_GCM_KEY_SIZE_256))
- return -EINVAL;
+ keylen != TIPC_AES_GCM_KEY_SIZE_256)) {
+ GENL_SET_ERR_MSG(info, "incorrect key length (20, 28 or 36 octets?)");
+ return -EKEYREJECTED;
+ }
return 0;
}
+/**
+ * tipc_aead_key_generate - Generate new session key
+ * @skey: input/output key with new content
+ *
+ * Return: 0 in case of success, otherwise < 0
+ */
+static int tipc_aead_key_generate(struct tipc_aead_key *skey)
+{
+ int rc = 0;
+
+ /* Fill the key's content with a random value via RNG cipher */
+ rc = crypto_get_default_rng();
+ if (likely(!rc)) {
+ rc = crypto_rng_get_bytes(crypto_default_rng, skey->key,
+ skey->keylen);
+ crypto_put_default_rng();
+ }
+
+ return rc;
+}
+
static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead)
{
struct tipc_aead *tmp;
@@ -339,6 +424,7 @@ static void tipc_aead_free(struct rcu_head *rp)
kfree(head);
}
free_percpu(aead->tfm_entry);
+ kfree_sensitive(aead->key);
kfree(aead);
}
@@ -397,6 +483,7 @@ static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val)
/**
* tipc_aead_tfm_next - Move TFM entry to the next one in list and return it
+ * @aead: the AEAD key pointer
*/
static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead)
{
@@ -501,14 +588,15 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
return err;
}
- /* Copy some chars from the user key as a hint */
- memcpy(tmp->hint, ukey->key, TIPC_AEAD_HINT_LEN);
- tmp->hint[TIPC_AEAD_HINT_LEN] = '\0';
+ /* Form a hex string of some last bytes as the key's hint */
+ bin2hex(tmp->hint, ukey->key + keylen - TIPC_AEAD_HINT_LEN,
+ TIPC_AEAD_HINT_LEN);
/* Initialize the other data */
tmp->mode = mode;
tmp->cloned = NULL;
tmp->authsize = TIPC_AES_GCM_TAG_SIZE;
+ tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL);
memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE);
atomic_set(&tmp->users, 0);
atomic64_set(&tmp->seqno, 0);
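
tipc_aead_init() now publishes only a hex rendering of the key's last TIPC_AEAD_HINT_LEN bytes as its hint, rather than copying raw key characters. A userspace sketch of building such a hint, with a shortened, made-up hint length and key:

#include <stdio.h>

#define HINT_LEN 4	/* illustrative; the kernel uses TIPC_AEAD_HINT_LEN */

int main(void)
{
	unsigned char key[] = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef };
	char hint[2 * HINT_LEN + 1];
	unsigned int i, keylen = sizeof(key);

	for (i = 0; i < HINT_LEN; i++)
		sprintf(&hint[2 * i], "%02x", key[keylen - HINT_LEN + i]);
	printf("key hint: %s\n", hint);	/* prints "89abcdef" */
	return 0;
}
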
@@ -633,9 +721,9 @@ static void *tipc_aead_mem_alloc(struct crypto_aead *tfm,
* @__dnode: TIPC dest node if "known"
*
* Return:
- * 0 : if the encryption has completed
- * -EINPROGRESS/-EBUSY : if a callback will be performed
- * < 0 : the encryption has failed
+ * * 0 : if the encryption has completed
+ * * -EINPROGRESS/-EBUSY : if a callback will be performed
+ * * < 0 : the encryption has failed
*/
static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb,
struct tipc_bearer *b,
@@ -663,13 +751,11 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb,
* but there is no frag_list, it should be still fine!
* Otherwise, we must cow it to be a writable buffer with the tailroom.
*/
-#ifdef TIPC_CRYPTO_DEBUG
SKB_LINEAR_ASSERT(skb);
if (tailen > skb_tailroom(skb)) {
- pr_warn("TX: skb tailroom is not enough: %d, requires: %d\n",
- skb_tailroom(skb), tailen);
+ pr_debug("TX(): skb tailroom is not enough: %d, requires: %d\n",
+ skb_tailroom(skb), tailen);
}
-#endif
if (unlikely(!skb_cloned(skb) && tailen <= skb_tailroom(skb))) {
nsg = 1;
@@ -791,9 +877,9 @@ static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err)
* @b: TIPC bearer where the message has been received
*
* Return:
- * 0 : if the decryption has completed
- * -EINPROGRESS/-EBUSY : if a callback will be performed
- * < 0 : the decryption has failed
+ * * 0 : if the decryption has completed
+ * * -EINPROGRESS/-EBUSY : if a callback will be performed
+ * * < 0 : the decryption has failed
*/
static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead,
struct sk_buff *skb, struct tipc_bearer *b)
@@ -922,7 +1008,7 @@ static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr)
* tipc_ehdr_validate - Validate an encryption message
* @skb: the message buffer
*
- * Returns "true" if this is a valid encryption message, otherwise "false"
+ * Return: "true" if this is a valid encryption message, otherwise "false"
*/
bool tipc_ehdr_validate(struct sk_buff *skb)
{
@@ -940,8 +1026,6 @@ bool tipc_ehdr_validate(struct sk_buff *skb)
return false;
if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE))
return false;
- if (unlikely(!ehdr->tx_key))
- return false;
return true;
}
@@ -994,6 +1078,8 @@ static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead,
ehdr->tx_key = tx_key;
ehdr->destined = (__rx) ? 1 : 0;
ehdr->rx_key_active = (__rx) ? __rx->key.active : 0;
+ ehdr->rx_nokey = (__rx) ? __rx->nokey : 0;
+ ehdr->master_key = aead->crypto->key_master;
ehdr->reserved_1 = 0;
ehdr->reserved_2 = 0;
@@ -1019,23 +1105,16 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
u8 new_active,
u8 new_pending)
{
-#ifdef TIPC_CRYPTO_DEBUG
struct tipc_key old = c->key;
char buf[32];
-#endif
c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) |
((new_active & KEY_MASK) << (KEY_BITS)) |
((new_pending & KEY_MASK));
-#ifdef TIPC_CRYPTO_DEBUG
- pr_info("%s(%s): key changing %s ::%pS\n",
- (c->node) ? "RX" : "TX",
- (c->node) ? tipc_node_get_id_str(c->node) :
- tipc_own_id_string(c->net),
- tipc_key_change_dump(old, c->key, buf),
- __builtin_return_address(0));
-#endif
+ pr_debug("%s: key changing %s ::%pS\n", c->name,
+ tipc_key_change_dump(old, c->key, buf),
+ __builtin_return_address(0));
}
/**
@@ -1043,6 +1122,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
* @c: TIPC crypto to which new key is attached
* @ukey: the user key
* @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY)
+ * @master_key: specify this is a cluster master key
*
* A new TIPC AEAD key will be allocated and initiated with the specified user
* key, then attached to the TIPC crypto.
@@ -1050,7 +1130,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
* Return: new key id in case of success, otherwise: < 0
*/
int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
- u8 mode)
+ u8 mode, bool master_key)
{
struct tipc_aead *aead = NULL;
int rc = 0;
@@ -1060,17 +1140,11 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
/* Attach it to the crypto */
if (likely(!rc)) {
- rc = tipc_crypto_key_attach(c, aead, 0);
+ rc = tipc_crypto_key_attach(c, aead, 0, master_key);
if (rc < 0)
tipc_aead_free(&aead->rcu);
}
- pr_info("%s(%s): key initiating, rc %d!\n",
- (c->node) ? "RX" : "TX",
- (c->node) ? tipc_node_get_id_str(c->node) :
- tipc_own_id_string(c->net),
- rc);
-
return rc;
}
@@ -1079,58 +1153,58 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
* @c: TIPC crypto to which the new AEAD key is attached
* @aead: the new AEAD key pointer
* @pos: desired slot in the crypto key array, = 0 if any!
+ * @master_key: specify this is a cluster master key
*
* Return: new key id in case of success, otherwise: -EBUSY
*/
static int tipc_crypto_key_attach(struct tipc_crypto *c,
- struct tipc_aead *aead, u8 pos)
+ struct tipc_aead *aead, u8 pos,
+ bool master_key)
{
- u8 new_pending, new_passive, new_key;
struct tipc_key key;
int rc = -EBUSY;
+ u8 new_key;
spin_lock_bh(&c->lock);
key = c->key;
+ if (master_key) {
+ new_key = KEY_MASTER;
+ goto attach;
+ }
if (key.active && key.passive)
goto exit;
- if (key.passive && !tipc_aead_users(c->aead[key.passive]))
- goto exit;
if (key.pending) {
- if (pos)
- goto exit;
if (tipc_aead_users(c->aead[key.pending]) > 0)
goto exit;
+ /* if (pos): ok with replacing, will be aligned when needed */
/* Replace it */
- new_pending = key.pending;
- new_passive = key.passive;
- new_key = new_pending;
+ new_key = key.pending;
} else {
if (pos) {
if (key.active && pos != key_next(key.active)) {
- new_pending = key.pending;
- new_passive = pos;
- new_key = new_passive;
+ key.passive = pos;
+ new_key = pos;
goto attach;
} else if (!key.active && !key.passive) {
- new_pending = pos;
- new_passive = key.passive;
- new_key = new_pending;
+ key.pending = pos;
+ new_key = pos;
goto attach;
}
}
- new_pending = key_next(key.active ?: key.passive);
- new_passive = key.passive;
- new_key = new_pending;
+ key.pending = key_next(key.active ?: key.passive);
+ new_key = key.pending;
}
attach:
aead->crypto = c;
- tipc_crypto_key_set_state(c, new_passive, key.active, new_pending);
+ aead->gen = (is_tx(c)) ? ++c->key_gen : c->key_gen;
tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock);
-
+ if (likely(c->key.keys != key.keys))
+ tipc_crypto_key_set_state(c, key.passive, key.active,
+ key.pending);
c->working = 1;
- c->timer1 = jiffies;
- c->timer2 = jiffies;
+ c->nokey = 0;
+ c->key_master |= master_key;
rc = new_key;
exit:
@@ -1140,14 +1214,33 @@ exit:
void tipc_crypto_key_flush(struct tipc_crypto *c)
{
+ struct tipc_crypto *tx, *rx;
int k;
spin_lock_bh(&c->lock);
- c->working = 0;
+ if (is_rx(c)) {
+ /* Try to cancel pending work */
+ rx = c;
+ tx = tipc_net(rx->net)->crypto_tx;
+ if (cancel_delayed_work(&rx->work)) {
+ kfree(rx->skey);
+ rx->skey = NULL;
+ atomic_xchg(&rx->key_distr, 0);
+ tipc_node_put(rx->node);
+ }
+ /* RX stopping => decrease TX key users if any */
+ k = atomic_xchg(&rx->peer_rx_active, 0);
+ if (k) {
+ tipc_aead_users_dec(tx->aead[k], 0);
+ /* Mark the point TX key users changed */
+ tx->timer1 = jiffies;
+ }
+ }
+
+ c->flags = 0;
tipc_crypto_key_set_state(c, 0, 0, 0);
for (k = KEY_MIN; k <= KEY_MAX; k++)
tipc_crypto_key_detach(c->aead[k], &c->lock);
- atomic_set(&c->peer_rx_active, 0);
atomic64_set(&c->sndnxt, 0);
spin_unlock_bh(&c->lock);
}
@@ -1206,7 +1299,8 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending)
rcu_assign_pointer(rx->aead[new_passive], tmp2);
refcount_set(&tmp1->refcnt, 1);
aligned = true;
- pr_info("RX(%s): key is aligned!\n", tipc_node_get_id_str(rx->node));
+ pr_info_ratelimited("%s: key[%d] -> key[%d]\n", rx->name, key.pending,
+ new_pending);
exit:
spin_unlock(&rx->lock);
@@ -1218,6 +1312,7 @@ exit:
* @tx: TX crypto handle
* @rx: RX crypto handle (can be NULL)
* @skb: the message skb which will be decrypted later
+ * @tx_key: peer TX key id
*
* This function looks up the existing TX keys and pick one which is suitable
* for the message decryption, that must be a cluster key and not used before
@@ -1227,7 +1322,8 @@ exit:
*/
static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
struct tipc_crypto *rx,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u8 tx_key)
{
struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb);
struct tipc_aead *aead = NULL;
@@ -1246,6 +1342,10 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
/* Pick one TX key */
spin_lock(&tx->lock);
+ if (tx_key == KEY_MASTER) {
+ aead = tipc_aead_rcu_ptr(tx->aead[KEY_MASTER], &tx->lock);
+ goto done;
+ }
do {
k = (i == 0) ? key.pending :
((i == 1) ? key.active : key.passive);
@@ -1265,9 +1365,12 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
skb->next = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!skb->next))
pr_warn("Failed to clone skb for next round if any\n");
- WARN_ON(!refcount_inc_not_zero(&aead->refcnt));
break;
} while (++i < 3);
+
+done:
+ if (likely(aead))
+ WARN_ON(!refcount_inc_not_zero(&aead->refcnt));
spin_unlock(&tx->lock);
return aead;
@@ -1276,53 +1379,73 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
/**
* tipc_crypto_key_synch: Synch own key data according to peer key status
* @rx: RX crypto handle
- * @new_rx_active: latest RX active key from peer
- * @hdr: TIPCv2 message
+ * @skb: TIPCv2 message buffer (incl. the ehdr from peer)
*
* This function updates the peer node related data as the peer RX active key
* has changed, so the number of TX keys' users on this node are increased and
* decreased correspondingly.
*
+ * It also considers if peer has no key, then we need to make own master key
+ * (if any) taking over i.e. starting grace period and also trigger key
+ * distributing process.
+ *
* The "per-peer" sndnxt is also reset when the peer key has switched.
*/
-static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active,
- struct tipc_msg *hdr)
+static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb)
{
- struct net *net = rx->net;
- struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
- u8 cur_rx_active;
+ struct tipc_ehdr *ehdr = (struct tipc_ehdr *)skb_network_header(skb);
+ struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx;
+ struct tipc_msg *hdr = buf_msg(skb);
+ u32 self = tipc_own_addr(rx->net);
+ u8 cur, new;
+ unsigned long delay;
- /* TX might be even not ready yet */
- if (unlikely(!tx->key.active && !tx->key.pending))
- return;
+ /* Update RX 'key_master' flag according to peer, also mark "legacy" if
+ * a peer has no master key.
+ */
+ rx->key_master = ehdr->master_key;
+ if (!rx->key_master)
+ tx->legacy_user = 1;
- cur_rx_active = atomic_read(&rx->peer_rx_active);
- if (likely(cur_rx_active == new_rx_active))
+ /* For later cases, apply only if message is destined to this node */
+ if (!ehdr->destined || msg_short(hdr) || msg_destnode(hdr) != self)
return;
- /* Make sure this message destined for this node */
- if (unlikely(msg_short(hdr) ||
- msg_destnode(hdr) != tipc_own_addr(net)))
- return;
+ /* Case 1: Peer has no keys, let's make master key take over */
+ if (ehdr->rx_nokey) {
+ /* Set or extend grace period */
+ tx->timer2 = jiffies;
+ /* Schedule key distributing for the peer if not yet */
+ if (tx->key.keys &&
+ !atomic_cmpxchg(&rx->key_distr, 0, KEY_DISTR_SCHED)) {
+ get_random_bytes(&delay, 2);
+ delay %= 5;
+ delay = msecs_to_jiffies(500 * ++delay);
+ if (queue_delayed_work(tx->wq, &rx->work, delay))
+ tipc_node_get(rx->node);
+ }
+ } else {
+ /* Cancel a pending key distributing if any */
+ atomic_xchg(&rx->key_distr, 0);
+ }
- /* Peer RX active key has changed, try to update owns' & TX users */
- if (atomic_cmpxchg(&rx->peer_rx_active,
- cur_rx_active,
- new_rx_active) == cur_rx_active) {
- if (new_rx_active)
- tipc_aead_users_inc(tx->aead[new_rx_active], INT_MAX);
- if (cur_rx_active)
- tipc_aead_users_dec(tx->aead[cur_rx_active], 0);
+ /* Case 2: Peer RX active key has changed, let's update own TX users */
+ cur = atomic_read(&rx->peer_rx_active);
+ new = ehdr->rx_key_active;
+ if (tx->key.keys &&
+ cur != new &&
+ atomic_cmpxchg(&rx->peer_rx_active, cur, new) == cur) {
+ if (new)
+ tipc_aead_users_inc(tx->aead[new], INT_MAX);
+ if (cur)
+ tipc_aead_users_dec(tx->aead[cur], 0);
atomic64_set(&rx->sndnxt, 0);
/* Mark the point TX key users changed */
tx->timer1 = jiffies;
-#ifdef TIPC_CRYPTO_DEBUG
- pr_info("TX(%s): key users changed %d-- %d++, peer RX(%s)\n",
- tipc_own_id_string(net), cur_rx_active,
- new_rx_active, tipc_node_get_id_str(rx->node));
-#endif
+ pr_debug("%s: key users changed %d-- %d++, peer %s\n",
+ tx->name, cur, new, rx->name);
}
}
@@ -1340,7 +1463,7 @@ static int tipc_crypto_key_revoke(struct net *net, u8 tx_key)
tipc_crypto_key_detach(tx->aead[key.active], &tx->lock);
spin_unlock(&tx->lock);
- pr_warn("TX(%s): key is revoked!\n", tipc_own_id_string(net));
+ pr_warn("%s: key is revoked\n", tx->name);
return -EKEYREVOKED;
}
@@ -1357,6 +1480,15 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
if (!c)
return -ENOMEM;
+ /* Allocate workqueue on TX */
+ if (!node) {
+ c->wq = alloc_ordered_workqueue("tipc_crypto", 0);
+ if (!c->wq) {
+ kfree(c);
+ return -ENOMEM;
+ }
+ }
+
/* Allocate statistic structure */
c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC);
if (!c->stats) {
@@ -1364,53 +1496,52 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
return -ENOMEM;
}
- c->working = 0;
+ c->flags = 0;
c->net = net;
c->node = node;
+ get_random_bytes(&c->key_gen, 2);
tipc_crypto_key_set_state(c, 0, 0, 0);
+ atomic_set(&c->key_distr, 0);
atomic_set(&c->peer_rx_active, 0);
atomic64_set(&c->sndnxt, 0);
c->timer1 = jiffies;
c->timer2 = jiffies;
+ c->rekeying_intv = TIPC_REKEYING_INTV_DEF;
spin_lock_init(&c->lock);
- *crypto = c;
+ scnprintf(c->name, 48, "%s(%s)", (is_rx(c)) ? "RX" : "TX",
+ (is_rx(c)) ? tipc_node_get_id_str(c->node) :
+ tipc_own_id_string(c->net));
+
+ if (is_rx(c))
+ INIT_DELAYED_WORK(&c->work, tipc_crypto_work_rx);
+ else
+ INIT_DELAYED_WORK(&c->work, tipc_crypto_work_tx);
+ *crypto = c;
return 0;
}
void tipc_crypto_stop(struct tipc_crypto **crypto)
{
- struct tipc_crypto *c, *tx, *rx;
- bool is_rx;
+ struct tipc_crypto *c = *crypto;
u8 k;
- if (!*crypto)
+ if (!c)
return;
- rcu_read_lock();
- /* RX stopping? => decrease TX key users if any */
- is_rx = !!((*crypto)->node);
- if (is_rx) {
- rx = *crypto;
- tx = tipc_net(rx->net)->crypto_tx;
- k = atomic_read(&rx->peer_rx_active);
- if (k) {
- tipc_aead_users_dec(tx->aead[k], 0);
- /* Mark the point TX key users changed */
- tx->timer1 = jiffies;
- }
+ /* Flush any queued works & destroy wq */
+ if (is_tx(c)) {
+ c->rekeying_intv = 0;
+ cancel_delayed_work_sync(&c->work);
+ destroy_workqueue(c->wq);
}
/* Release AEAD keys */
- c = *crypto;
+ rcu_read_lock();
for (k = KEY_MIN; k <= KEY_MAX; k++)
tipc_aead_put(rcu_dereference(c->aead[k]));
rcu_read_unlock();
-
- pr_warn("%s(%s) has been purged, node left!\n",
- (is_rx) ? "RX" : "TX",
- (is_rx) ? tipc_node_get_id_str((*crypto)->node) :
- tipc_own_id_string((*crypto)->net));
+ pr_debug("%s: has been stopped\n", c->name);
/* Free this crypto statistics */
free_percpu(c->stats);
@@ -1424,106 +1555,91 @@ void tipc_crypto_timeout(struct tipc_crypto *rx)
struct tipc_net *tn = tipc_net(rx->net);
struct tipc_crypto *tx = tn->crypto_tx;
struct tipc_key key;
- u8 new_pending, new_passive;
int cmd;
- /* TX key activating:
- * The pending key (users > 0) -> active
- * The active key if any (users == 0) -> free
- */
+ /* TX pending: taking all users & stable -> active */
spin_lock(&tx->lock);
key = tx->key;
if (key.active && tipc_aead_users(tx->aead[key.active]) > 0)
goto s1;
if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0)
goto s1;
- if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_LIM))
+ if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_TIME))
goto s1;
tipc_crypto_key_set_state(tx, key.passive, key.pending, 0);
if (key.active)
tipc_crypto_key_detach(tx->aead[key.active], &tx->lock);
this_cpu_inc(tx->stats->stat[STAT_SWITCHES]);
- pr_info("TX(%s): key %d is activated!\n", tipc_own_id_string(tx->net),
- key.pending);
+ pr_info("%s: key[%d] is activated\n", tx->name, key.pending);
s1:
spin_unlock(&tx->lock);
- /* RX key activating:
- * The pending key (users > 0) -> active
- * The active key if any -> passive, freed later
- */
+ /* RX pending: having user -> active */
spin_lock(&rx->lock);
key = rx->key;
if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0)
goto s2;
- new_pending = (key.passive &&
- !tipc_aead_users(rx->aead[key.passive])) ?
- key.passive : 0;
- new_passive = (key.active) ?: ((new_pending) ? 0 : key.passive);
- tipc_crypto_key_set_state(rx, new_passive, key.pending, new_pending);
+ if (key.active)
+ key.passive = key.active;
+ key.active = key.pending;
+ rx->timer2 = jiffies;
+ tipc_crypto_key_set_state(rx, key.passive, key.active, 0);
this_cpu_inc(rx->stats->stat[STAT_SWITCHES]);
- pr_info("RX(%s): key %d is activated!\n",
- tipc_node_get_id_str(rx->node), key.pending);
+ pr_info("%s: key[%d] is activated\n", rx->name, key.pending);
goto s5;
s2:
- /* RX key "faulty" switching:
- * The faulty pending key (users < -30) -> passive
- * The passive key (users = 0) -> pending
- * Note: This only happens after RX deactivated - s3!
- */
- key = rx->key;
- if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -30)
- goto s3;
- if (!key.passive || tipc_aead_users(rx->aead[key.passive]) != 0)
+ /* RX pending: not working -> remove */
+ if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -10)
goto s3;
- new_pending = key.passive;
- new_passive = key.pending;
- tipc_crypto_key_set_state(rx, new_passive, key.active, new_pending);
+ tipc_crypto_key_set_state(rx, key.passive, key.active, 0);
+ tipc_crypto_key_detach(rx->aead[key.pending], &rx->lock);
+ pr_debug("%s: key[%d] is removed\n", rx->name, key.pending);
goto s5;
s3:
- /* RX key deactivating:
- * The passive key if any -> pending
- * The active key -> passive (users = 0) / pending
- * The pending key if any -> passive (users = 0)
- */
- key = rx->key;
+ /* RX active: timed out or no user -> pending */
if (!key.active)
goto s4;
- if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM))
+ if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM) &&
+ tipc_aead_users(rx->aead[key.active]) > 0)
goto s4;
- new_pending = (key.passive) ?: key.active;
- new_passive = (key.passive) ? key.active : key.pending;
- tipc_aead_users_set(rx->aead[new_pending], 0);
- if (new_passive)
- tipc_aead_users_set(rx->aead[new_passive], 0);
- tipc_crypto_key_set_state(rx, new_passive, 0, new_pending);
- pr_info("RX(%s): key %d is deactivated!\n",
- tipc_node_get_id_str(rx->node), key.active);
+ if (key.pending)
+ key.passive = key.active;
+ else
+ key.pending = key.active;
+ rx->timer2 = jiffies;
+ tipc_crypto_key_set_state(rx, key.passive, 0, key.pending);
+ tipc_aead_users_set(rx->aead[key.pending], 0);
+ pr_debug("%s: key[%d] is deactivated\n", rx->name, key.active);
goto s5;
s4:
- /* RX key passive -> freed: */
- key = rx->key;
- if (!key.passive || !tipc_aead_users(rx->aead[key.passive]))
+ /* RX passive: outdated or not working -> free */
+ if (!key.passive)
goto s5;
- if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM))
+ if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM) &&
+ tipc_aead_users(rx->aead[key.passive]) > -10)
goto s5;
tipc_crypto_key_set_state(rx, 0, key.active, key.pending);
tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock);
- pr_info("RX(%s): key %d is freed!\n", tipc_node_get_id_str(rx->node),
- key.passive);
+ pr_debug("%s: key[%d] is freed\n", rx->name, key.passive);
s5:
spin_unlock(&rx->lock);
+ /* Clear the flag here; it will be set again if a legacy user really
+ * exists, but for safety only when we are not in the grace period!
+ */
+ if (time_after(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD))
+ tx->legacy_user = 0;
+
/* Limit max_tfms & do debug commands if needed */
if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM))
return;
@@ -1533,6 +1649,22 @@ s5:
tipc_crypto_do_cmd(rx->net, cmd);
}
+static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb,
+ struct tipc_bearer *b,
+ struct tipc_media_addr *dst,
+ struct tipc_node *__dnode, u8 type)
+{
+ struct sk_buff *skb;
+
+ skb = skb_clone(_skb, GFP_ATOMIC);
+ if (skb) {
+ TIPC_SKB_CB(skb)->xmit_type = type;
+ tipc_crypto_xmit(net, &skb, b, dst, __dnode);
+ if (skb)
+ b->media->send_msg(net, skb, b, dst);
+ }
+}
+
/**
* tipc_crypto_xmit - Build & encrypt TIPC message for xmit
* @net: struct net
@@ -1542,18 +1674,19 @@ s5:
* @__dnode: destination node for reference if any
*
* First, build an encryption message header on the top of the message, then
- * encrypt the original TIPC message by using the active or pending TX key.
+ * encrypt the original TIPC message by using the pending, master or active
+ * key, in that order of preference.
* If the encryption is successful, the encrypted skb is returned directly or
* via the callback.
* Otherwise, the skb is freed!
*
* Return:
- * 0 : the encryption has succeeded (or no encryption)
- * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made
- * -ENOKEK : the encryption has failed due to no key
- * -EKEYREVOKED : the encryption has failed due to key revoked
- * -ENOMEM : the encryption has failed due to no memory
- * < 0 : the encryption has failed due to other reasons
+ * * 0 : the encryption has succeeded (or no encryption)
+ * * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made
+ * * -ENOKEY : the encryption has failed due to no key
+ * * -EKEYREVOKED : the encryption has failed due to key revoked
+ * * -ENOMEM : the encryption has failed due to no memory
+ * * < 0 : the encryption has failed due to other reasons
*/
int tipc_crypto_xmit(struct net *net, struct sk_buff **skb,
struct tipc_bearer *b, struct tipc_media_addr *dst,
@@ -1562,46 +1695,67 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb,
struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode);
struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
struct tipc_crypto_stats __percpu *stats = tx->stats;
+ struct tipc_msg *hdr = buf_msg(*skb);
struct tipc_key key = tx->key;
struct tipc_aead *aead = NULL;
- struct sk_buff *probe;
+ u32 user = msg_user(hdr);
+ u32 type = msg_type(hdr);
int rc = -ENOKEY;
- u8 tx_key;
+ u8 tx_key = 0;
/* No encryption? */
if (!tx->working)
return 0;
- /* Try with the pending key if available and:
- * 1) This is the only choice (i.e. no active key) or;
- * 2) Peer has switched to this key (unicast only) or;
- * 3) It is time to do a pending key probe;
- */
+ /* Pending key: use it if the peer is active on it or it is probing time */
if (unlikely(key.pending)) {
tx_key = key.pending;
- if (!key.active)
+ if (!tx->key_master && !key.active)
goto encrypt;
if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key)
goto encrypt;
- if (TIPC_SKB_CB(*skb)->probe)
+ if (TIPC_SKB_CB(*skb)->xmit_type == SKB_PROBING) {
+ pr_debug("%s: probing for key[%d]\n", tx->name,
+ key.pending);
+ goto encrypt;
+ }
+ if (user == LINK_CONFIG || user == LINK_PROTOCOL)
+ tipc_crypto_clone_msg(net, *skb, b, dst, __dnode,
+ SKB_PROBING);
+ }
+
+ /* Master key if this is a *vital* message or in grace period */
+ if (tx->key_master) {
+ tx_key = KEY_MASTER;
+ if (!key.active)
+ goto encrypt;
+ if (TIPC_SKB_CB(*skb)->xmit_type == SKB_GRACING) {
+ pr_debug("%s: gracing for msg (%d %d)\n", tx->name,
+ user, type);
goto encrypt;
- if (!__rx &&
- time_after(jiffies, tx->timer2 + TIPC_TX_PROBE_LIM)) {
- tx->timer2 = jiffies;
- probe = skb_clone(*skb, GFP_ATOMIC);
- if (probe) {
- TIPC_SKB_CB(probe)->probe = 1;
- tipc_crypto_xmit(net, &probe, b, dst, __dnode);
- if (probe)
- b->media->send_msg(net, probe, b, dst);
+ }
+ if (user == LINK_CONFIG ||
+ (user == LINK_PROTOCOL && type == RESET_MSG) ||
+ (user == MSG_CRYPTO && type == KEY_DISTR_MSG) ||
+ time_before(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) {
+ if (__rx && __rx->key_master &&
+ !atomic_read(&__rx->peer_rx_active))
+ goto encrypt;
+ if (!__rx) {
+ if (likely(!tx->legacy_user))
+ goto encrypt;
+ tipc_crypto_clone_msg(net, *skb, b, dst,
+ __dnode, SKB_GRACING);
}
}
}
+
/* Else, use the active key if any */
if (likely(key.active)) {
tx_key = key.active;
goto encrypt;
}
+
goto exit;
encrypt:
@@ -1652,12 +1806,12 @@ exit:
* cluster key(s) can be taken for decryption (- recursive).
*
* Return:
- * 0 : the decryption has successfully completed
- * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made
- * -ENOKEY : the decryption has failed due to no key
- * -EBADMSG : the decryption has failed due to bad message
- * -ENOMEM : the decryption has failed due to no memory
- * < 0 : the decryption has failed due to other reasons
+ * * 0 : the decryption has successfully completed
+ * * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made
+ * * -ENOKEY : the decryption has failed due to no key
+ * * -EBADMSG : the decryption has failed due to bad message
+ * * -ENOMEM : the decryption has failed due to no memory
+ * * < 0 : the decryption has failed due to other reasons
*/
int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
struct sk_buff **skb, struct tipc_bearer *b)
@@ -1667,30 +1821,21 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
struct tipc_aead *aead = NULL;
struct tipc_key key;
int rc = -ENOKEY;
- u8 tx_key = 0;
+ u8 tx_key, n;
+
+ tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key;
/* New peer?
* Let's try with TX key (i.e. cluster mode) & verify the skb first!
*/
- if (unlikely(!rx))
+ if (unlikely(!rx || tx_key == KEY_MASTER))
goto pick_tx;
- /* Pick RX key according to TX key, three cases are possible:
- * 1) The current active key (likely) or;
- * 2) The pending (new or deactivated) key (if any) or;
- * 3) The passive or old active key (i.e. users > 0);
- */
- tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key;
+ /* Pick RX key according to TX key if any */
key = rx->key;
- if (likely(tx_key == key.active))
+ if (tx_key == key.active || tx_key == key.pending ||
+ tx_key == key.passive)
goto decrypt;
- if (tx_key == key.pending)
- goto decrypt;
- if (tx_key == key.passive) {
- rx->timer2 = jiffies;
- if (tipc_aead_users(rx->aead[key.passive]) > 0)
- goto decrypt;
- }
/* Unknown key, let's try to align RX key(s) */
if (tipc_crypto_key_try_align(rx, tx_key))
@@ -1698,7 +1843,7 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
pick_tx:
/* No key suitable? Try to pick one from TX... */
- aead = tipc_crypto_key_pick_tx(tx, rx, *skb);
+ aead = tipc_crypto_key_pick_tx(tx, rx, *skb, tx_key);
if (aead)
goto decrypt;
goto exit;
@@ -1726,8 +1871,19 @@ exit:
if (rc == -ENOKEY) {
kfree_skb(*skb);
*skb = NULL;
- if (rx)
+ if (rx) {
+ /* Mark rx->nokey only if we don't have a
+ * pending received session key, nor a newer
+ * one, i.e. in the next slot.
+ */
+ n = key_next(tx_key);
+ rx->nokey = !(rx->skey ||
+ rcu_access_pointer(rx->aead[n]));
+ pr_debug_ratelimited("%s: nokey %d, key %d/%x\n",
+ rx->name, rx->nokey,
+ tx_key, rx->key.keys);
tipc_node_put(rx->node);
+ }
this_cpu_inc(stats->stat[STAT_NOKEYS]);
return rc;
} else if (rc == -EBADMSG) {
@@ -1749,21 +1905,17 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
struct tipc_aead *tmp = NULL;
struct tipc_ehdr *ehdr;
struct tipc_node *n;
- u8 rx_key_active;
- bool destined;
/* Is this completed by TX? */
- if (unlikely(!rx->node)) {
+ if (unlikely(is_tx(aead->crypto))) {
rx = skb_cb->tx_clone_ctx.rx;
-#ifdef TIPC_CRYPTO_DEBUG
- pr_info("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n",
- (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead,
- (*skb)->next, skb_cb->flags);
- pr_info("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n",
- skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last,
- aead->crypto->aead[1], aead->crypto->aead[2],
- aead->crypto->aead[3]);
-#endif
+ pr_debug("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n",
+ (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead,
+ (*skb)->next, skb_cb->flags);
+ pr_debug("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n",
+ skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last,
+ aead->crypto->aead[1], aead->crypto->aead[2],
+ aead->crypto->aead[3]);
if (unlikely(err)) {
if (err == -EBADMSG && (*skb)->next)
tipc_rcv(net, (*skb)->next, b);
@@ -1784,12 +1936,12 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
goto free_skb;
}
- /* Skip cloning this time as we had a RX pending key */
- if (rx->key.pending)
+ /* Ignore cloning if it was TX master key */
+ if (ehdr->tx_key == KEY_MASTER)
goto rcv;
if (tipc_aead_clone(&tmp, aead) < 0)
goto rcv;
- if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) {
+ if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key, false) < 0) {
tipc_aead_free(&tmp->rcu);
goto rcv;
}
@@ -1805,14 +1957,18 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
/* Set the RX key's user */
tipc_aead_users_set(aead, 1);
-rcv:
/* Mark this point, RX works */
rx->timer1 = jiffies;
+rcv:
/* Remove ehdr & auth. tag prior to tipc_rcv() */
ehdr = (struct tipc_ehdr *)(*skb)->data;
- destined = ehdr->destined;
- rx_key_active = ehdr->rx_key_active;
+
+ /* Mark this point, RX passive still works */
+ if (rx->key.passive && ehdr->tx_key == rx->key.passive)
+ rx->timer2 = jiffies;
+
+ skb_reset_network_header(*skb);
skb_pull(*skb, tipc_ehdr_size(ehdr));
pskb_trim(*skb, (*skb)->len - aead->authsize);
@@ -1822,9 +1978,8 @@ rcv:
goto free_skb;
}
- /* Update peer RX active key & TX users */
- if (destined)
- tipc_crypto_key_synch(rx, rx_key_active, buf_msg(*skb));
+ /* Ok, everything's fine, try to synch own keys according to peers' */
+ tipc_crypto_key_synch(rx, *skb);
/* Mark skb decrypted */
skb_cb->decrypted = 1;
@@ -1883,7 +2038,7 @@ print_stats:
/* Print crypto statistics */
for (i = 0, j = 0; i < MAX_STATS; i++)
j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]);
- pr_info("\nCounter %s", buf);
+ pr_info("Counter %s", buf);
memset(buf, '-', 115);
buf[115] = '\0';
@@ -1927,21 +2082,31 @@ static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf)
char *s;
for (k = KEY_MIN; k <= KEY_MAX; k++) {
- if (k == key.passive)
- s = "PAS";
- else if (k == key.active)
- s = "ACT";
- else if (k == key.pending)
- s = "PEN";
- else
- s = "-";
+ if (k == KEY_MASTER) {
+ if (is_rx(c))
+ continue;
+ if (time_before(jiffies,
+ c->timer2 + TIPC_TX_GRACE_PERIOD))
+ s = "ACT";
+ else
+ s = "PAS";
+ } else {
+ if (k == key.passive)
+ s = "PAS";
+ else if (k == key.active)
+ s = "ACT";
+ else if (k == key.pending)
+ s = "PEN";
+ else
+ s = "-";
+ }
i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s);
rcu_read_lock();
aead = rcu_dereference(c->aead[k]);
if (aead)
i += scnprintf(buf + i, 200 - i,
- "{\"%s...\", \"%s\"}/%d:%d",
+ "{\"0x...%s\", \"%s\"}/%d:%d",
aead->hint,
(aead->mode == CLUSTER_KEY) ? "c" : "p",
atomic_read(&aead->users),
@@ -1950,14 +2115,13 @@ static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf)
i += scnprintf(buf + i, 200 - i, "\n");
}
- if (c->node)
+ if (is_rx(c))
i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n",
atomic_read(&c->peer_rx_active));
return buf;
}
-#ifdef TIPC_CRYPTO_DEBUG
static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
char *buf)
{
@@ -1968,7 +2132,7 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
/* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */
again:
i += scnprintf(buf + i, 32 - i, "[");
- for (k = KEY_MIN; k <= KEY_MAX; k++) {
+ for (k = KEY_1; k <= KEY_3; k++) {
if (k == key->passive)
s = "pas";
else if (k == key->active)
@@ -1978,7 +2142,7 @@ again:
else
s = "-";
i += scnprintf(buf + i, 32 - i,
- (k != KEY_MAX) ? "%s " : "%s", s);
+ (k != KEY_3) ? "%s " : "%s", s);
}
if (key != &new) {
i += scnprintf(buf + i, 32 - i, "] -> ");
@@ -1988,4 +2152,320 @@ again:
i += scnprintf(buf + i, 32 - i, "]");
return buf;
}
-#endif
+
+/**
+ * tipc_crypto_msg_rcv - Common 'MSG_CRYPTO' processing point
+ * @net: the struct net
+ * @skb: the receiving message buffer
+ */
+void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb)
+{
+ struct tipc_crypto *rx;
+ struct tipc_msg *hdr;
+
+ if (unlikely(skb_linearize(skb)))
+ goto exit;
+
+ hdr = buf_msg(skb);
+ rx = tipc_node_crypto_rx_by_addr(net, msg_prevnode(hdr));
+ if (unlikely(!rx))
+ goto exit;
+
+ switch (msg_type(hdr)) {
+ case KEY_DISTR_MSG:
+ if (tipc_crypto_key_rcv(rx, hdr))
+ goto exit;
+ break;
+ default:
+ break;
+ }
+
+ tipc_node_put(rx->node);
+
+exit:
+ kfree_skb(skb);
+}
+
+/**
+ * tipc_crypto_key_distr - Distribute a TX key
+ * @tx: the TX crypto
+ * @key: the key's index
+ * @dest: the destination tipc node, = NULL if distributing to all nodes
+ *
+ * Return: 0 in case of success, otherwise < 0
+ */
+int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key,
+ struct tipc_node *dest)
+{
+ struct tipc_aead *aead;
+ u32 dnode = tipc_node_get_addr(dest);
+ int rc = -ENOKEY;
+
+ if (!sysctl_tipc_key_exchange_enabled)
+ return 0;
+
+ if (key) {
+ rcu_read_lock();
+ aead = tipc_aead_get(tx->aead[key]);
+ if (likely(aead)) {
+ rc = tipc_crypto_key_xmit(tx->net, aead->key,
+ aead->gen, aead->mode,
+ dnode);
+ tipc_aead_put(aead);
+ }
+ rcu_read_unlock();
+ }
+
+ return rc;
+}
+
+/**
+ * tipc_crypto_key_xmit - Send a session key
+ * @net: the struct net
+ * @skey: the session key to be sent
+ * @gen: the key's generation
+ * @mode: the key's mode
+ * @dnode: the destination node address, = 0 if broadcasting to all nodes
+ *
+ * The session key 'skey' is packed in a TIPC v2 'MSG_CRYPTO/KEY_DISTR_MSG'
+ * as its data section, then xmit-ed through the uc/bc link.
+ *
+ * Return: 0 in case of success, otherwise < 0
+ */
+static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey,
+ u16 gen, u8 mode, u32 dnode)
+{
+ struct sk_buff_head pkts;
+ struct tipc_msg *hdr;
+ struct sk_buff *skb;
+ u16 size, cong_link_cnt;
+ u8 *data;
+ int rc;
+
+ size = tipc_aead_key_size(skey);
+ skb = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = buf_msg(skb);
+ tipc_msg_init(tipc_own_addr(net), hdr, MSG_CRYPTO, KEY_DISTR_MSG,
+ INT_H_SIZE, dnode);
+ msg_set_size(hdr, INT_H_SIZE + size);
+ msg_set_key_gen(hdr, gen);
+ msg_set_key_mode(hdr, mode);
+
+ data = msg_data(hdr);
+ *((__be32 *)(data + TIPC_AEAD_ALG_NAME)) = htonl(skey->keylen);
+ memcpy(data, skey->alg_name, TIPC_AEAD_ALG_NAME);
+ memcpy(data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->key,
+ skey->keylen);
+
+ __skb_queue_head_init(&pkts);
+ __skb_queue_tail(&pkts, skb);
+ if (dnode)
+ rc = tipc_node_xmit(net, &pkts, dnode, 0);
+ else
+ rc = tipc_bcast_xmit(net, &pkts, &cong_link_cnt);
+
+ return rc;
+}
+
+/**
+ * tipc_crypto_key_rcv - Receive a session key
+ * @rx: the RX crypto
+ * @hdr: the TIPC v2 message incl. the receiving session key in its data
+ *
+ * This function retrieves the session key from the peer's message, then
+ * schedules an RX work to attach the key to the corresponding RX crypto.
+ *
+ * Return: "true" if the key has been scheduled for attaching, otherwise
+ * "false".
+ */
+static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr)
+{
+ struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx;
+ struct tipc_aead_key *skey = NULL;
+ u16 key_gen = msg_key_gen(hdr);
+ u16 size = msg_data_sz(hdr);
+ u8 *data = msg_data(hdr);
+
+ spin_lock(&rx->lock);
+ if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) {
+ pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name,
+ rx->skey, key_gen, rx->key_gen);
+ goto exit;
+ }
+
+ /* Allocate memory for the key */
+ skey = kmalloc(size, GFP_ATOMIC);
+ if (unlikely(!skey)) {
+ pr_err("%s: unable to allocate memory for skey\n", rx->name);
+ goto exit;
+ }
+
+ /* Copy key from msg data */
+ skey->keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME)));
+ memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME);
+ memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32),
+ skey->keylen);
+
+ /* Sanity check */
+ if (unlikely(size != tipc_aead_key_size(skey))) {
+ kfree(skey);
+ skey = NULL;
+ goto exit;
+ }
+
+ rx->key_gen = key_gen;
+ rx->skey_mode = msg_key_mode(hdr);
+ rx->skey = skey;
+ rx->nokey = 0;
+ mb(); /* for nokey flag */
+
+exit:
+ spin_unlock(&rx->lock);
+
+ /* Schedule the key attaching on this crypto */
+ if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0)))
+ return true;
+
+ return false;
+}
+
+/**
+ * tipc_crypto_work_rx - Scheduled RX works handler
+ * @work: the struct RX work
+ *
+ * The function processes the previously scheduled works, i.e. distributing the
+ * TX key or attaching a received session key to the RX crypto.
+ */
+static void tipc_crypto_work_rx(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct tipc_crypto *rx = container_of(dwork, struct tipc_crypto, work);
+ struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx;
+ unsigned long delay = msecs_to_jiffies(5000);
+ bool resched = false;
+ u8 key;
+ int rc;
+
+ /* Case 1: Distribute TX key to peer if scheduled */
+ if (atomic_cmpxchg(&rx->key_distr,
+ KEY_DISTR_SCHED,
+ KEY_DISTR_COMPL) == KEY_DISTR_SCHED) {
+ /* Always pick the newest one for distributing */
+ key = tx->key.pending ?: tx->key.active;
+ rc = tipc_crypto_key_distr(tx, key, rx->node);
+ if (unlikely(rc))
+ pr_warn("%s: unable to distr key[%d] to %s, err %d\n",
+ tx->name, key, tipc_node_get_id_str(rx->node),
+ rc);
+
+ /* Sched for key_distr releasing */
+ resched = true;
+ } else {
+ atomic_cmpxchg(&rx->key_distr, KEY_DISTR_COMPL, 0);
+ }
+
+ /* Case 2: Attach a pending received session key from peer if any */
+ if (rx->skey) {
+ rc = tipc_crypto_key_init(rx, rx->skey, rx->skey_mode, false);
+ if (unlikely(rc < 0))
+ pr_warn("%s: unable to attach received skey, err %d\n",
+ rx->name, rc);
+ switch (rc) {
+ case -EBUSY:
+ case -ENOMEM:
+ /* Resched the key attaching */
+ resched = true;
+ break;
+ default:
+ synchronize_rcu();
+ kfree(rx->skey);
+ rx->skey = NULL;
+ break;
+ }
+ }
+
+ if (resched && queue_delayed_work(tx->wq, &rx->work, delay))
+ return;
+
+ tipc_node_put(rx->node);
+}
+
+/**
+ * tipc_crypto_rekeying_sched - (Re)schedule rekeying, optionally with a new interval
+ * @tx: TX crypto
+ * @changed: whether the rekeying needs to be rescheduled with a new interval
+ * @new_intv: new rekeying interval (when "changed" = true)
+ */
+void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed,
+ u32 new_intv)
+{
+ unsigned long delay;
+ bool now = false;
+
+ if (changed) {
+ if (new_intv == TIPC_REKEYING_NOW)
+ now = true;
+ else
+ tx->rekeying_intv = new_intv;
+ cancel_delayed_work_sync(&tx->work);
+ }
+
+ if (tx->rekeying_intv || now) {
+ delay = (now) ? 0 : tx->rekeying_intv * 60 * 1000;
+ queue_delayed_work(tx->wq, &tx->work, msecs_to_jiffies(delay));
+ }
+}
+
+/**
+ * tipc_crypto_work_tx - Scheduled TX works handler
+ * @work: the struct TX work
+ *
+ * The function processes the previously scheduled work, i.e. key rekeying, by
+ * generating a new session key based on the current one, then attaching it to
+ * the TX crypto and finally distributing it to peers. It also re-schedules the
+ * rekeying if needed.
+ */
+static void tipc_crypto_work_tx(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct tipc_crypto *tx = container_of(dwork, struct tipc_crypto, work);
+ struct tipc_aead_key *skey = NULL;
+ struct tipc_key key = tx->key;
+ struct tipc_aead *aead;
+ int rc = -ENOMEM;
+
+ if (unlikely(key.pending))
+ goto resched;
+
+ /* Take current key as a template */
+ rcu_read_lock();
+ aead = rcu_dereference(tx->aead[key.active ?: KEY_MASTER]);
+ if (unlikely(!aead)) {
+ rcu_read_unlock();
+ /* At least one key should exist for securing */
+ return;
+ }
+
+ /* Let's duplicate it first */
+ skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC);
+ rcu_read_unlock();
+
+ /* Now, generate a new key, initiate & distribute it */
+ if (likely(skey)) {
+ rc = tipc_aead_key_generate(skey) ?:
+ tipc_crypto_key_init(tx, skey, PER_NODE_KEY, false);
+ if (likely(rc > 0))
+ rc = tipc_crypto_key_distr(tx, rc, NULL);
+ kfree_sensitive(skey);
+ }
+
+ if (unlikely(rc))
+ pr_warn_ratelimited("%s: rekeying returns %d\n", tx->name, rc);
+
+resched:
+ /* Re-schedule rekeying if any */
+ tipc_crypto_rekeying_sched(tx, false, 0);
+}
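
For reference, the KEY_DISTR_MSG payload built by tipc_crypto_key_xmit() above and parsed back in tipc_crypto_key_rcv() is laid out as the algorithm name, a big-endian key length and the raw key bytes, back to back. Below is a minimal userspace-style sketch of that packing, assuming the same 32-byte TIPC_AEAD_ALG_NAME field; the helper and buffer names are illustrative only, not part of the kernel API.

#include <arpa/inet.h>   /* htonl() */
#include <stdint.h>
#include <string.h>

#define TIPC_AEAD_ALG_NAME 32   /* fixed-size algorithm name field, as in the patch */

/* Pack [alg_name (32B)][__be32 keylen][key bytes], mirroring the layout
 * written by tipc_crypto_key_xmit(); returns the number of bytes used.
 */
static size_t key_distr_pack(uint8_t *data, const char *alg_name,
                             const uint8_t *key, uint32_t keylen)
{
	uint32_t be = htonl(keylen);

	memcpy(data, alg_name, TIPC_AEAD_ALG_NAME);
	memcpy(data + TIPC_AEAD_ALG_NAME, &be, sizeof(be));
	memcpy(data + TIPC_AEAD_ALG_NAME + sizeof(be), key, keylen);
	return TIPC_AEAD_ALG_NAME + sizeof(be) + keylen;
}

int main(void)
{
	uint8_t buf[64], key[16] = { 0 };
	char alg[TIPC_AEAD_ALG_NAME] = "gcm(aes)";   /* zero-padded to 32 bytes */
	size_t n = key_distr_pack(buf, alg, key, sizeof(key));

	return n == TIPC_AEAD_ALG_NAME + 4 + sizeof(key) ? 0 : 1;
}
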
diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h
index c3de769f49e8..ce7d4cc8a9e0 100644
--- a/net/tipc/crypto.h
+++ b/net/tipc/crypto.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/**
+/*
* net/tipc/crypto.h: Include file for TIPC crypto
*
* Copyright (c) 2019, Ericsson AB
@@ -53,7 +53,7 @@
#define TIPC_AES_GCM_IV_SIZE 12
#define TIPC_AES_GCM_TAG_SIZE 16
-/**
+/*
* TIPC crypto modes:
* - CLUSTER_KEY:
* One single key is used for both TX & RX in all nodes in the cluster.
@@ -67,14 +67,15 @@ enum {
};
extern int sysctl_tipc_max_tfms __read_mostly;
+extern int sysctl_tipc_key_exchange_enabled __read_mostly;
-/**
+/*
* TIPC encryption message format:
*
* 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
* 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * w0:|Ver=7| User |D|TX |RX |K| Rsvd |
+ * w0:|Ver=7| User |D|TX |RX |K|M|N| Rsvd |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* w1:| Seqno |
* w2:| (8 octets) |
@@ -101,6 +102,9 @@ extern int sysctl_tipc_max_tfms __read_mostly;
* RX : Currently RX active key corresponding to the destination
* node's TX key (when the "D" bit is set)
* K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only)
+ * M : Bit indicates if sender has master key
+ * N : Bit indicates if sender has no RX keys corresponding to the
+ * receiver's TX (when the "D" bit is set)
* Rsvd : Reserved bit, field
* Word1-2:
* Seqno : The 64-bit sequence number of the encrypted message, also
@@ -117,7 +121,9 @@ struct tipc_ehdr {
__u8 destined:1,
user:4,
version:3;
- __u8 reserved_1:3,
+ __u8 reserved_1:1,
+ rx_nokey:1,
+ master_key:1,
keepalive:1,
rx_key_active:2,
tx_key:2;
@@ -128,7 +134,9 @@ struct tipc_ehdr {
__u8 tx_key:2,
rx_key_active:2,
keepalive:1,
- reserved_1:3;
+ master_key:1,
+ rx_nokey:1,
+ reserved_1:1;
#else
#error "Please fix <asm/byteorder.h>"
#endif
@@ -158,10 +166,35 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb,
int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
struct sk_buff **skb, struct tipc_bearer *b);
int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
- u8 mode);
+ u8 mode, bool master_key);
void tipc_crypto_key_flush(struct tipc_crypto *c);
-int tipc_aead_key_validate(struct tipc_aead_key *ukey);
+int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key,
+ struct tipc_node *dest);
+void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb);
+void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed,
+ u32 new_intv);
+int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info);
bool tipc_ehdr_validate(struct sk_buff *skb);
+static inline u32 msg_key_gen(struct tipc_msg *m)
+{
+ return msg_bits(m, 4, 16, 0xffff);
+}
+
+static inline void msg_set_key_gen(struct tipc_msg *m, u32 gen)
+{
+ msg_set_bits(m, 4, 16, 0xffff, gen);
+}
+
+static inline u32 msg_key_mode(struct tipc_msg *m)
+{
+ return msg_bits(m, 4, 0, 0xf);
+}
+
+static inline void msg_set_key_mode(struct tipc_msg *m, u32 mode)
+{
+ msg_set_bits(m, 4, 0, 0xf, mode);
+}
+
#endif /* _TIPC_CRYPTO_H */
#endif
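
The msg_key_gen()/msg_key_mode() accessors added above pack the key generation into bits 16..31 and the key mode into bits 0..3 of header word 4. The following is a hedged, standalone sketch of that bit layout on a plain host-order 32-bit word; the real code goes through TIPC's msg_bits()/msg_set_bits() helpers, which also handle network byte order, so the function names here are purely illustrative.

#include <stdint.h>
#include <stdio.h>

/* Word 4 layout assumed here: key generation in bits 16..31, mode in bits 0..3 */
static uint32_t set_key_gen(uint32_t w4, uint32_t gen)
{
	return (w4 & ~(0xffffu << 16)) | ((gen & 0xffffu) << 16);
}

static uint32_t set_key_mode(uint32_t w4, uint32_t mode)
{
	return (w4 & ~0xfu) | (mode & 0xfu);
}

int main(void)
{
	uint32_t w4 = 0;

	w4 = set_key_gen(w4, 0x1234);
	w4 = set_key_mode(w4, 2);   /* arbitrary example mode value */
	printf("w4 = 0x%08x, gen = 0x%x, mode = %u\n",
	       (unsigned)w4, (unsigned)((w4 >> 16) & 0xffff), (unsigned)(w4 & 0xf));
	return 0;
}
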
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index d4ecacddb40c..5380f605b851 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -74,6 +74,7 @@ struct tipc_discoverer {
/**
* tipc_disc_init_msg - initialize a link setup message
* @net: the applicable net namespace
+ * @skb: buffer containing message
* @mtyp: message type (request or response)
* @b: ptr to bearer issuing message
*/
@@ -341,7 +342,7 @@ exit:
* @dest: destination address for request messages
* @skb: pointer to created frame
*
- * Returns 0 if successful, otherwise -errno.
+ * Return: 0 if successful, otherwise -errno.
*/
int tipc_disc_create(struct net *net, struct tipc_bearer *b,
struct tipc_media_addr *dest, struct sk_buff **skb)
@@ -380,7 +381,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b,
/**
* tipc_disc_delete - destroy object sending periodic link setup requests
- * @d: ptr to link duest structure
+ * @d: ptr to link dest structure
*/
void tipc_disc_delete(struct tipc_discoverer *d)
{
diff --git a/net/tipc/group.c b/net/tipc/group.c
index b1fcd2ad5ecf..3e137d8c9d2f 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -2,6 +2,7 @@
* net/tipc/group.c: TIPC group messaging code
*
* Copyright (c) 2017, Ericsson AB
+ * Copyright (c) 2020, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -359,7 +360,7 @@ struct tipc_nlist *tipc_group_dests(struct tipc_group *grp)
return &grp->dests;
}
-void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq,
+void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq,
int *scope)
{
seq->type = grp->type;
diff --git a/net/tipc/group.h b/net/tipc/group.h
index 76b4e5a7b39d..ea4c3be64c78 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -2,6 +2,7 @@
* net/tipc/group.h: Include file for TIPC group unicast/multicast functions
*
* Copyright (c) 2017, Ericsson AB
+ * Copyright (c) 2020, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -50,7 +51,7 @@ void tipc_group_delete(struct net *net, struct tipc_group *grp);
void tipc_group_add_member(struct tipc_group *grp, u32 node,
u32 port, u32 instance);
struct tipc_nlist *tipc_group_dests(struct tipc_group *grp);
-void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq,
+void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq,
int *scope);
u32 tipc_group_exclude(struct tipc_group *grp);
void tipc_group_filter_msg(struct tipc_group *grp,
diff --git a/net/tipc/link.c b/net/tipc/link.c
index cef38a910107..115109259430 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -120,6 +120,34 @@ struct tipc_stats {
* @reasm_buf: head of partially reassembled inbound message fragments
* @bc_rcvr: marks that this is a broadcast receiver link
* @stats: collects statistics regarding link activity
+ * @session: session to be used by link
+ * @snd_nxt_state: next send seq number
+ * @rcv_nxt_state: next rcv seq number
+ * @in_session: have received ACTIVATE_MSG from peer
+ * @active: link is active
+ * @if_name: associated interface name
+ * @rst_cnt: link reset counter
+ * @drop_point: seq number for failover handling (FIXME)
+ * @failover_reasm_skb: saved failover msg ptr (FIXME)
+ * @failover_deferdq: deferred message queue for failover processing (FIXME)
+ * @transmq: the link's transmit queue
+ * @backlog: link's backlog by priority (importance)
+ * @snd_nxt: next sequence number to be used
+ * @rcv_unacked: # messages read by user, but not yet acked back to peer
+ * @deferdq: deferred receive queue
+ * @window: sliding window size for congestion handling
+ * @min_win: minimal send window to be used by link
+ * @ssthresh: slow start threshold for congestion handling
+ * @max_win: maximal send window to be used by link
+ * @cong_acks: congestion acks for congestion avoidance (FIXME)
+ * @checkpoint: seq number for congestion window size handling
+ * @reasm_tnlmsg: fragmentation/reassembly area for tunnel protocol message
+ * @last_gap: last gap ack blocks for bcast (FIXME)
+ * @last_ga: ptr to gap ack blocks
+ * @bc_rcvlink: the peer specific link used for broadcast reception
+ * @bc_sndlink: the namespace global link used for broadcast sending
+ * @nack_state: bcast nack state
+ * @bc_peer_is_up: peer has acked the bcast init msg
*/
struct tipc_link {
u32 addr;
@@ -216,11 +244,6 @@ enum {
#define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10))
#define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1))
-/*
- * Interval between NACKs when packets arrive out of order
- */
-#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
-
/* Link FSM states:
*/
enum {
@@ -455,7 +478,6 @@ u32 tipc_link_state(struct tipc_link *l)
* @min_win: minimal send window to be used by link
* @max_win: maximal send window to be used by link
* @session: session to be used by link
- * @ownnode: identity of own node
* @peer: node id of peer node
* @peer_caps: bitmap describing peer node capabilities
* @bc_sndlink: the namespace global link used for broadcast sending
@@ -463,8 +485,10 @@ u32 tipc_link_state(struct tipc_link *l)
* @inputq: queue to put messages ready for delivery
* @namedq: queue to put binding table update messages ready for delivery
* @link: return value, pointer to put the created link
+ * @self: local unicast link id
+ * @peer_id: 128-bit ID of peer
*
- * Returns true if link was created, otherwise false
+ * Return: true if link was created, otherwise false
*/
bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
int tolerance, char net_plane, u32 mtu, int priority,
@@ -537,8 +561,13 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
* @inputq: queue to put messages ready for delivery
* @namedq: queue to put binding table update messages ready for delivery
* @link: return value, pointer to put the created link
+ * @ownnode: identity of own node
+ * @peer: node id of peer node
+ * @peer_id: 128-bit ID of peer
+ * @peer_caps: bitmap describing peer node capabilities
+ * @bc_sndlink: the namespace global link used for broadcast sending
*
- * Returns true if link was created, otherwise false
+ * Return: true if link was created, otherwise false
*/
bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id,
int mtu, u32 min_win, u32 max_win, u16 peer_caps,
@@ -793,7 +822,7 @@ static void link_profile_stats(struct tipc_link *l)
* tipc_link_too_silent - check if link is "too silent"
* @l: tipc link to be checked
*
- * Returns true if the link 'silent_intv_cnt' is about to reach the
+ * Return: true if the link 'silent_intv_cnt' is about to reach the
* 'abort_limit' value, otherwise false
*/
bool tipc_link_too_silent(struct tipc_link *l)
@@ -995,13 +1024,12 @@ void tipc_link_reset(struct tipc_link *l)
* @xmitq: returned list of packets to be sent by caller
*
* Consumes the buffer chain.
- * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
* Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
+ * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
*/
int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
struct sk_buff_head *xmitq)
{
- struct tipc_msg *hdr = buf_msg(skb_peek(list));
struct sk_buff_head *backlogq = &l->backlogq;
struct sk_buff_head *transmq = &l->transmq;
struct sk_buff *skb, *_skb;
@@ -1009,13 +1037,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
u16 ack = l->rcv_nxt - 1;
u16 seqno = l->snd_nxt;
int pkt_cnt = skb_queue_len(list);
- int imp = msg_importance(hdr);
unsigned int mss = tipc_link_mss(l);
unsigned int cwin = l->window;
unsigned int mtu = l->mtu;
+ struct tipc_msg *hdr;
bool new_bundle;
int rc = 0;
+ int imp;
+
+ if (pkt_cnt <= 0)
+ return 0;
+ hdr = buf_msg(skb_peek(list));
if (unlikely(msg_size(hdr) > mtu)) {
pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n",
skb_queue_len(list), msg_user(hdr),
@@ -1024,6 +1057,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
return -EMSGSIZE;
}
+ imp = msg_importance(hdr);
/* Allow oversubscription of one data msg per source at congestion */
if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
if (imp == TIPC_SYSTEM_IMPORTANCE) {
@@ -1256,11 +1290,16 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
case MSG_FRAGMENTER:
case BCAST_PROTOCOL:
return false;
+#ifdef CONFIG_TIPC_CRYPTO
+ case MSG_CRYPTO:
+ tipc_crypto_msg_rcv(l->net, skb);
+ return true;
+#endif
default:
pr_warn("Dropping received illegal msg type\n");
kfree_skb(skb);
return true;
- };
+ }
}
/* tipc_link_input - process packet that has passed link protocol check
@@ -2376,7 +2415,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
if (!msg_peer_node_is_up(hdr))
return rc;
- /* Open when peer ackowledges our bcast init msg (pkt #1) */
+ /* Open when peer acknowledges our bcast init msg (pkt #1) */
if (msg_ack(hdr))
l->bc_peer_is_up = true;
@@ -2505,7 +2544,7 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win)
}
/**
- * link_reset_stats - reset link statistics
+ * tipc_link_reset_stats - reset link statistics
* @l: pointer to link
*/
void tipc_link_reset_stats(struct tipc_link *l)
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 52e93ba4d8e2..2aca86021df5 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -58,11 +58,13 @@ static unsigned int align(unsigned int i)
/**
* tipc_buf_acquire - creates a TIPC message buffer
* @size: message size (including TIPC header)
+ * @gfp: memory allocation flags
*
- * Returns a new buffer with data pointers set to the specified size.
+ * Return: a new buffer with data pointers set to the specified size.
*
- * NOTE: Headroom is reserved to allow prepending of a data link header.
- * There may also be unrequested tailroom present at the buffer's end.
+ * NOTE:
+ * Headroom is reserved to allow prepending of a data link header.
+ * There may also be unrequested tailroom present at the buffer's end.
*/
struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp)
{
@@ -150,11 +152,11 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
if (fragid == FIRST_FRAGMENT) {
if (unlikely(head))
goto err;
+ *buf = NULL;
frag = skb_unshare(frag, GFP_ATOMIC);
if (unlikely(!frag))
goto err;
head = *headbuf = frag;
- *buf = NULL;
TIPC_SKB_CB(head)->tail = NULL;
if (skb_is_nonlinear(head)) {
skb_walk_frags(head, tail) {
@@ -207,8 +209,9 @@ err:
* @m: the data to be appended
* @mss: max allowable size of buffer
* @dlen: size of data to be appended
- * @txq: queue to appand to
- * Returns the number og 1k blocks appended or errno value
+ * @txq: queue to append to
+ *
+ * Return: the number of 1k blocks appended or errno value
*/
int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen,
int mss, struct sk_buff_head *txq)
@@ -312,7 +315,7 @@ bool tipc_msg_validate(struct sk_buff **_skb)
* @pktmax: max size of a fragment incl. the header
* @frags: returned fragment skb list
*
- * Returns 0 if the fragmentation is successful, otherwise: -EINVAL
+ * Return: 0 if the fragmentation is successful, otherwise: -EINVAL
* or -ENOMEM
*/
int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr,
@@ -367,6 +370,7 @@ error:
* tipc_msg_build - create buffer chain containing specified header and data
* @mhdr: Message header, to be prepended to data
* @m: User message
+ * @offset: buffer offset for fragmented messages (FIXME)
* @dsz: Total length of user data
* @pktmax: Max packet size that can be used
* @list: Buffer or chain of buffers to be returned to caller
@@ -374,7 +378,7 @@ error:
* Note that the recursive call we are making here is safe, since it can
* logically go only one further level down.
*
- * Returns message data size or errno: -ENOMEM, -EFAULT
+ * Return: message data size or errno: -ENOMEM, -EFAULT
*/
int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
int dsz, int pktmax, struct sk_buff_head *list)
@@ -485,7 +489,7 @@ error:
* @msg: message to be appended
* @max: max allowable size for the bundle buffer
*
- * Returns "true" if bundling has been performed, otherwise "false"
+ * Return: "true" if bundling has been performed, otherwise "false"
*/
static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg,
u32 max)
@@ -580,9 +584,9 @@ bundle:
* @skb: buffer to be extracted from.
* @iskb: extracted inner buffer, to be returned
* @pos: position in outer message of msg to be extracted.
- * Returns position of next msg
+ * Returns position of next msg.
* Consumes outer buffer when last packet extracted
- * Returns true when when there is an extracted buffer, otherwise false
+ * Return: true when there is an extracted buffer, otherwise false
*/
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos)
{
@@ -626,7 +630,7 @@ none:
* @skb: buffer containing message to be reversed; will be consumed
* @err: error code to be set in message, if any
* Replaces consumed buffer with new one when successful
- * Returns true if success, otherwise false
+ * Return: true if success, otherwise false
*/
bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
{
@@ -698,10 +702,11 @@ bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy)
/**
* tipc_msg_lookup_dest(): try to find new destination for named message
+ * @net: pointer to associated network namespace
* @skb: the buffer containing the message.
* @err: error code to be used by caller if lookup fails
* Does not consume buffer
- * Returns true if a destination is found, false otherwise
+ * Return: true if a destination is found, false otherwise
*/
bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
{
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 1016e96db5c4..5d64596ba987 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -82,6 +82,7 @@ struct plist;
#define NAME_DISTRIBUTOR 11
#define MSG_FRAGMENTER 12
#define LINK_CONFIG 13
+#define MSG_CRYPTO 14
#define SOCK_WAKEUP 14 /* pseudo user */
#define TOP_SRV 15 /* pseudo user */
@@ -127,7 +128,9 @@ struct tipc_skb_cb {
#ifdef CONFIG_TIPC_CRYPTO
u8 encrypted:1;
u8 decrypted:1;
- u8 probe:1;
+#define SKB_PROBING 1
+#define SKB_GRACING 2
+ u8 xmit_type:2;
u8 tx_clone_deferred:1;
#endif
};
@@ -747,6 +750,9 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n)
#define GRP_RECLAIM_MSG 4
#define GRP_REMIT_MSG 5
+/* Crypto message types */
+#define KEY_DISTR_MSG 0
+
/*
* Word 1
*/
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 2f9c148f17e2..6cf57c3bfa27 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -50,6 +50,8 @@ struct distr_queue_item {
/**
* publ_to_item - add publication info to a publication message
+ * @p: publication info
+ * @i: location of item in the message
*/
static void publ_to_item(struct distr_item *i, struct publication *p)
{
@@ -62,6 +64,10 @@ static void publ_to_item(struct distr_item *i, struct publication *p)
/**
* named_prepare_buf - allocate & initialize a publication message
+ * @net: the associated network namespace
+ * @type: message type
+ * @size: payload size
+ * @dest: destination node
*
* The buffer returned is of size INT_H_SIZE + payload size
*/
@@ -83,6 +89,8 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
/**
* tipc_named_publish - tell other nodes about a new publication by this node
+ * @net: the associated network namespace
+ * @publ: the new publication
*/
struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
{
@@ -111,6 +119,8 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
/**
* tipc_named_withdraw - tell other nodes about a withdrawn publication by this node
+ * @net: the associated network namespace
+ * @publ: the withdrawn publication
*/
struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ)
{
@@ -138,9 +148,11 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ)
/**
* named_distribute - prepare name info for bulk distribution to another node
+ * @net: the associated network namespace
* @list: list of messages (buffers) to be returned from this function
* @dnode: node to be updated
* @pls: linked list of publication items to be packed into buffer chain
+ * @seqno: sequence number for this message
*/
static void named_distribute(struct net *net, struct sk_buff_head *list,
u32 dnode, struct list_head *pls, u16 seqno)
@@ -194,6 +206,9 @@ static void named_distribute(struct net *net, struct sk_buff_head *list,
/**
* tipc_named_node_up - tell specified node about all publications by this node
+ * @net: the associated network namespace
+ * @dnode: destination node
+ * @capabilities: peer node's capabilities
*/
void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities)
{
@@ -217,6 +232,9 @@ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities)
/**
* tipc_publ_purge - remove publication associated with a failed node
+ * @net: the associated network namespace
+ * @publ: the publication to remove
+ * @addr: failed node's address
*
* Invoked for each publication issued by a newly failed node.
* Removes publication structure from name table & deletes it.
@@ -244,24 +262,6 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr)
kfree_rcu(p, rcu);
}
-/**
- * tipc_dist_queue_purge - remove deferred updates from a node that went down
- */
-static void tipc_dist_queue_purge(struct net *net, u32 addr)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct distr_queue_item *e, *tmp;
-
- spin_lock_bh(&tn->nametbl_lock);
- list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) {
- if (e->node != addr)
- continue;
- list_del(&e->next);
- kfree(e);
- }
- spin_unlock_bh(&tn->nametbl_lock);
-}
-
void tipc_publ_notify(struct net *net, struct list_head *nsub_list,
u32 addr, u16 capabilities)
{
@@ -272,7 +272,6 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list,
list_for_each_entry_safe(publ, tmp, nsub_list, binding_node)
tipc_publ_purge(net, publ, addr);
- tipc_dist_queue_purge(net, addr);
spin_lock_bh(&tn->nametbl_lock);
if (!(capabilities & TIPC_NAMED_BCAST))
nt->rc_dests--;
@@ -282,9 +281,13 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list,
/**
* tipc_update_nametbl - try to process a nametable update and notify
* subscribers
+ * @net: the associated network namespace
+ * @i: location of item in the message
+ * @node: node address
+ * @dtype: name distributor message type
*
* tipc_nametbl_lock must be held.
- * Returns the publication item if successful, otherwise NULL.
+ * Return: the publication item if successful, otherwise NULL.
*/
static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
u32 node, u32 dtype)
@@ -327,8 +330,13 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq,
struct tipc_msg *hdr;
u16 seqno;
+ spin_lock_bh(&namedq->lock);
skb_queue_walk_safe(namedq, skb, tmp) {
- skb_linearize(skb);
+ if (unlikely(skb_linearize(skb))) {
+ __skb_unlink(skb, namedq);
+ kfree_skb(skb);
+ continue;
+ }
hdr = buf_msg(skb);
seqno = msg_named_seqno(hdr);
if (msg_is_last_bulk(hdr)) {
@@ -338,12 +346,14 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq,
if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) {
__skb_unlink(skb, namedq);
+ spin_unlock_bh(&namedq->lock);
return skb;
}
if (*open && (*rcv_nxt == seqno)) {
(*rcv_nxt)++;
__skb_unlink(skb, namedq);
+ spin_unlock_bh(&namedq->lock);
return skb;
}
@@ -353,11 +363,16 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq,
continue;
}
}
+ spin_unlock_bh(&namedq->lock);
return NULL;
}
/**
* tipc_named_rcv - process name table update messages sent by another node
+ * @net: the associated network namespace
+ * @namedq: queue to receive from
+ * @rcv_nxt: store last received seqno here
+ * @open: last bulk msg was received (FIXME)
*/
void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq,
u16 *rcv_nxt, bool *open)
@@ -385,6 +400,7 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq,
/**
* tipc_named_reinit - re-initialize local publications
+ * @net: the associated network namespace
*
* This routine is called whenever TIPC networking is enabled.
* All name table entries published by this node are updated to reflect
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
index 092323158f06..e231e6964d61 100644
--- a/net/tipc/name_distr.h
+++ b/net/tipc/name_distr.h
@@ -46,7 +46,7 @@
* @type: name sequence type
* @lower: name sequence lower bound
* @upper: name sequence upper bound
- * @ref: publishing port reference
+ * @port: publishing port reference
* @key: publication key
*
* ===> All fields are stored in network byte order. <===
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 2ac33d32edc2..ee5ac40ea2b6 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -3,6 +3,7 @@
*
* Copyright (c) 2000-2006, 2014-2018, Ericsson AB
* Copyright (c) 2004-2008, 2010-2014, Wind River Systems
+ * Copyright (c) 2020, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -103,7 +104,8 @@ RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks,
* range match
* @sr: the service range pointer as a loop cursor
* @sc: the pointer to tipc service which holds the service range rbtree
- * @start, end: the range (end >= start) for matching
+ * @start: beginning of the search range (end >= start) for matching
+ * @end: end of the search range (end >= start) for matching
*/
#define service_range_foreach_match(sr, sc, start, end) \
for (sr = service_range_match_first((sc)->ranges.rb_node, \
@@ -117,7 +119,8 @@ RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks,
/**
* service_range_match_first - find first service range matching a range
* @n: the root node of service range rbtree for searching
- * @start, end: the range (end >= start) for matching
+ * @start: beginning of the search range (end >= start) for matching
+ * @end: end of the search range (end >= start) for matching
*
* Return: the leftmost service range node in the rbtree that overlaps the
* specific range if any. Otherwise, returns NULL.
@@ -166,7 +169,8 @@ static struct service_range *service_range_match_first(struct rb_node *n,
/**
* service_range_match_next - find next service range matching a range
* @n: a node in service range rbtree from which the searching starts
- * @start, end: the range (end >= start) for matching
+ * @start: beginning of the search range (end >= start) for matching
+ * @end: end of the search range (end >= start) for matching
*
* Return: the next service range node to the given node in the rbtree that
* overlaps the specific range if any. Otherwise, returns NULL.
@@ -218,6 +222,13 @@ static int hash(int x)
/**
* tipc_publ_create - create a publication structure
+ * @type: name sequence type
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @scope: publication scope
+ * @node: network address of publishing socket
+ * @port: publishing port
+ * @key: publication key
*/
static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper,
u32 scope, u32 node, u32 port,
@@ -245,6 +256,8 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper,
/**
* tipc_service_create - create a service structure for the specified 'type'
+ * @type: service type
+ * @hd: name_table services list
*
* Allocates a single range structure and sets it to all 0's.
*/
@@ -361,6 +374,9 @@ err:
/**
* tipc_service_remove_publ - remove a publication from a service
+ * @sr: service_range to remove publication from
+ * @node: target node
+ * @key: target publication key
*/
static struct publication *tipc_service_remove_publ(struct service_range *sr,
u32 node, u32 key)
@@ -377,7 +393,7 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr,
return NULL;
}
-/**
+/*
* Code reused: time_after32() for the same purpose
*/
#define publication_after(pa, pb) time_after32((pa)->id, (pb)->id)
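The one-line comment above is terse by design: publication ids come from a 32-bit counter that can wrap, so ordering them with a plain "<" would misorder ids on either side of the wrap point, and time_after32() provides the standard wrap-safe comparison. A sketch of the semantics it relies on, not part of the patch and assuming ids come from a monotonically increasing u32 counter:

	/* "a was issued after b" when the unsigned distance b - a, viewed as
	 * signed, is negative; correct across wraparound as long as the two
	 * ids are less than 2^31 apart.
	 */
	static inline bool publ_id_after(u32 a, u32 b)
	{
		return (s32)(b - a) < 0;
	}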
@@ -395,6 +411,8 @@ static int tipc_publ_sort(void *priv, struct list_head *a,
* tipc_service_subscribe - attach a subscription, and optionally
* issue the prescribed number of events if there is any service
* range overlapping with the requested range
+ * @service: the tipc_service to attach the @sub to
+ * @sub: the subscription to attach
*/
static void tipc_service_subscribe(struct tipc_service *service,
struct tipc_subscription *sub)
@@ -403,12 +421,12 @@ static void tipc_service_subscribe(struct tipc_service *service,
struct publication *p, *first, *tmp;
struct list_head publ_list;
struct service_range *sr;
- struct tipc_name_seq ns;
+ struct tipc_service_range r;
u32 filter;
- ns.type = tipc_sub_read(sb, seq.type);
- ns.lower = tipc_sub_read(sb, seq.lower);
- ns.upper = tipc_sub_read(sb, seq.upper);
+ r.type = tipc_sub_read(sb, seq.type);
+ r.lower = tipc_sub_read(sb, seq.lower);
+ r.upper = tipc_sub_read(sb, seq.upper);
filter = tipc_sub_read(sb, filter);
tipc_sub_get(sub);
@@ -418,7 +436,7 @@ static void tipc_service_subscribe(struct tipc_service *service,
return;
INIT_LIST_HEAD(&publ_list);
- service_range_foreach_match(sr, service, ns.lower, ns.upper) {
+ service_range_foreach_match(sr, service, r.lower, r.upper) {
first = NULL;
list_for_each_entry(p, &sr->all_publ, all_publ) {
if (filter & TIPC_SUB_PORTS)
@@ -528,14 +546,16 @@ exit:
/**
* tipc_nametbl_translate - perform service instance to socket translation
- *
- * On entry, 'dnode' is the search domain used during translation.
+ * @net: network namespace
+ * @type: message type
+ * @instance: message instance
+ * @dnode: the search domain used during translation
*
* On exit:
* - if translation is deferred to another node, leave 'dnode' unchanged and
- * return 0
+ * return 0
* - if translation is attempted and succeeds, set 'dnode' to the publishing
- * node and return the published (non-zero) port number
+ * node and return the published (non-zero) port number
* - if translation is attempted and fails, set 'dnode' to 0 and return 0
*
* Note that for legacy users (node configured with Z.C.N address format) the
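The three-way contract (deferred / resolved / failed) is what callers branch on. An illustrative caller, not from this patch (the helper name is made up), showing how a sender separates "no such service anywhere" from "translate again on the destination node":

	static int resolve_service(struct net *net, u32 type, u32 inst,
				   u32 *dnode, u32 *dport)
	{
		/* On entry *dnode is the search domain */
		*dport = tipc_nametbl_translate(net, type, inst, dnode);
		if (!*dport && !*dnode)
			return -EHOSTUNREACH;	/* lookup failed outright */
		return 0;	/* resolved here, or deferred to *dnode */
	}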
@@ -756,6 +776,11 @@ exit:
/**
* tipc_nametbl_withdraw - withdraw a service binding
+ * @net: network namespace
+ * @type: service type
+ * @lower: service range lower bound
+ * @upper: service range upper bound
+ * @key: target publication key
*/
int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
u32 upper, u32 key)
@@ -791,6 +816,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
/**
* tipc_nametbl_subscribe - add a subscription object to the name table
+ * @sub: subscription to add
*/
bool tipc_nametbl_subscribe(struct tipc_subscription *sub)
{
@@ -821,6 +847,7 @@ bool tipc_nametbl_subscribe(struct tipc_subscription *sub)
/**
* tipc_nametbl_unsubscribe - remove a subscription object from name table
+ * @sub: subscription to remove
*/
void tipc_nametbl_unsubscribe(struct tipc_subscription *sub)
{
@@ -870,7 +897,9 @@ int tipc_nametbl_init(struct net *net)
}
/**
- * tipc_service_delete - purge all publications for a service and delete it
+ * tipc_service_delete - purge all publications for a service and delete it
+ * @net: the associated network namespace
+ * @sc: tipc_service to delete
*/
static void tipc_service_delete(struct net *net, struct tipc_service *sc)
{
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 8064e1986e2c..5a82a01369d6 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -60,8 +60,8 @@ struct tipc_group;
* @key: publication key, unique across the cluster
* @id: publication id
* @binding_node: all publications from the same node which bound this one
- * - Remote publications: in node->publ_list
- * Used by node/name distr to withdraw publications when node is lost
+ * - Remote publications: in node->publ_list;
+ * Used by node/name distr to withdraw publications when node is lost
* - Local/node scope publications: in name_table->node_scope list
* - Local/cluster scope publications: in name_table->cluster_scope list
* @binding_sock: all publications from the same socket which bound this one
@@ -92,13 +92,16 @@ struct publication {
/**
* struct name_table - table containing all existing port name publications
- * @seq_hlist: name sequence hash lists
+ * @services: name sequence hash lists
* @node_scope: all local publications with node scope
* - used by name_distr during re-init of name table
* @cluster_scope: all local publications with cluster scope
* - used by name_distr to send bulk updates to new nodes
* - used by name_distr during re-init of name table
+ * @cluster_scope_lock: lock for accessing @cluster_scope
* @local_publ_count: number of publications issued by this node
+ * @rc_dests: destination node counter
+ * @snd_nxt: next sequence number to be used
*/
struct name_table {
struct hlist_head services[TIPC_NAMETBL_SIZE];
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 85400e4242de..a129f661bee3 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -105,12 +105,6 @@
* - A local spin_lock protecting the queue of subscriber events.
*/
-struct tipc_net_work {
- struct work_struct work;
- struct net *net;
- u32 addr;
-};
-
static void tipc_net_finalize(struct net *net, u32 addr);
int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
@@ -138,29 +132,25 @@ static void tipc_net_finalize(struct net *net, u32 addr)
tipc_named_reinit(net);
tipc_sk_reinit(net);
tipc_mon_reinit_self(net);
- tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr,
+ tipc_nametbl_publish(net, TIPC_NODE_STATE, addr, addr,
TIPC_CLUSTER_SCOPE, 0, addr);
}
-static void tipc_net_finalize_work(struct work_struct *work)
+void tipc_net_finalize_work(struct work_struct *work)
{
struct tipc_net_work *fwork;
fwork = container_of(work, struct tipc_net_work, work);
tipc_net_finalize(fwork->net, fwork->addr);
- kfree(fwork);
}
void tipc_sched_net_finalize(struct net *net, u32 addr)
{
- struct tipc_net_work *fwork = kzalloc(sizeof(*fwork), GFP_ATOMIC);
+ struct tipc_net *tn = tipc_net(net);
- if (!fwork)
- return;
- INIT_WORK(&fwork->work, tipc_net_finalize_work);
- fwork->net = net;
- fwork->addr = addr;
- schedule_work(&fwork->work);
+ tn->final_work.net = net;
+ tn->final_work.addr = addr;
+ schedule_work(&tn->final_work.work);
}
void tipc_net_stop(struct net *net)
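The deferred finalize work is no longer allocated with GFP_ATOMIC on every call; the tipc_net_work is assumed to live as an embedded member (final_work) of struct tipc_net, a change made in core.h/core.c and not visible in this hunk. A minimal sketch of that pattern under the same assumption (helper name illustrative):

	/* Done once at namespace init, so tipc_sched_net_finalize() never
	 * has to allocate in atomic context.
	 */
	static void tipc_init_final_work(struct tipc_net *tn)
	{
		INIT_WORK(&tn->final_work.work, tipc_net_finalize_work);
	}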
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 6740d97c706e..d0c91d2df20a 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -42,6 +42,7 @@
extern const struct nla_policy tipc_nl_net_policy[];
int tipc_net_init(struct net *net, u8 *node_id, u32 addr);
+void tipc_net_finalize_work(struct work_struct *work);
void tipc_sched_net_finalize(struct net *net, u32 addr);
void tipc_net_stop(struct net *net);
int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index c4aee6247d55..c447cb5f879e 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -108,6 +108,8 @@ const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
.len = TIPC_NODEID_LEN},
[TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY,
.len = TIPC_AEAD_KEY_SIZE_MAX},
+ [TIPC_NLA_NODE_KEY_MASTER] = { .type = NLA_FLAG },
+ [TIPC_NLA_NODE_REKEYING] = { .type = NLA_U32 },
};
/* Properties valid for media, bearer and link */
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 90e3c70a91ad..5a1ce64039f7 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -118,7 +118,8 @@ static void tipc_tlv_init(struct sk_buff *skb, u16 type)
skb_put(skb, sizeof(struct tlv_desc));
}
-static int tipc_tlv_sprintf(struct sk_buff *skb, const char *fmt, ...)
+static __printf(2, 3) int tipc_tlv_sprintf(struct sk_buff *skb,
+ const char *fmt, ...)
{
int n;
u16 len;
@@ -212,12 +213,14 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
}
info.attrs = attrbuf;
- err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf,
- tipc_genl_family.maxattr,
- tipc_genl_family.policy, NULL);
- if (err)
- goto err_out;
+ if (nlmsg_len(cb.nlh) > 0) {
+ err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf,
+ tipc_genl_family.maxattr,
+ tipc_genl_family.policy, NULL);
+ if (err)
+ goto err_out;
+ }
do {
int rem;
@@ -588,7 +591,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
return 0;
tipc_tlv_sprintf(msg->rep, "\nLink <%s>\n",
- nla_data(link[TIPC_NLA_LINK_NAME]));
+ (char *)nla_data(link[TIPC_NLA_LINK_NAME]));
if (link[TIPC_NLA_LINK_BROADCAST]) {
__fill_bc_link_stat(msg, prop, stats);
@@ -695,7 +698,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]);
link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP]));
- nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME],
+ nla_strscpy(link_info.str, link[TIPC_NLA_LINK_NAME],
TIPC_MAX_LINK_NAME);
return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO,
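nla_strscpy() is the strscpy()-style successor to nla_strlcpy(): it always NUL-terminates the destination and, unlike its predecessor, reports truncation as -E2BIG rather than returning the would-be length. The compat code ignores the return value; a caller that cares could check it, as in this sketch (the helper is illustrative, not part of the patch):

	static void copy_link_name(char *dst, const struct nlattr *name_attr)
	{
		if (nla_strscpy(dst, name_attr, TIPC_MAX_LINK_NAME) == -E2BIG)
			pr_debug("tipc: link name truncated\n");
	}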
@@ -1337,7 +1340,7 @@ send:
return err;
}
-static const struct genl_ops tipc_genl_compat_ops[] = {
+static const struct genl_small_ops tipc_genl_compat_ops[] = {
{
.cmd = TIPC_GENL_CMD,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -1352,8 +1355,8 @@ static struct genl_family tipc_genl_compat_family __ro_after_init = {
.maxattr = 0,
.netnsok = true,
.module = THIS_MODULE,
- .ops = tipc_genl_compat_ops,
- .n_ops = ARRAY_SIZE(tipc_genl_compat_ops),
+ .small_ops = tipc_genl_compat_ops,
+ .n_small_ops = ARRAY_SIZE(tipc_genl_compat_ops),
};
int __init tipc_netlink_compat_start(void)
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 4edcee3088da..008670d1f43e 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -82,7 +82,7 @@ struct tipc_bclink_entry {
/**
* struct tipc_node - TIPC node structure
* @addr: network address of node
- * @ref: reference counter to node object
+ * @kref: reference counter to node object
* @lock: rwlock governing access to structure
* @net: the applicable net namespace
* @hash: links to adjacent nodes in unsorted hash chain
@@ -90,9 +90,11 @@ struct tipc_bclink_entry {
* @namedq: pointer to name table input queue with name table messages
* @active_links: bearer ids of active links, used as index into links[] array
* @links: array containing references to all links to node
+ * @bc_entry: broadcast link entry
* @action_flags: bit mask of different types of node actions
* @state: connectivity state vs peer node
* @preliminary: a preliminary node or not
+ * @failover_sent: failover sent or not
* @sync_point: sequence number where synch/failover is finished
* @list: links to adjacent nodes in sorted list of cluster's nodes
* @working_links: number of working links to node (both active and standby)
@@ -100,9 +102,16 @@ struct tipc_bclink_entry {
* @capabilities: bitmap, indicating peer node's functional capabilities
* @signature: node instance identifier
* @link_id: local and remote bearer ids of changing link, if any
+ * @peer_id: 128-bit ID of peer
+ * @peer_id_string: ID string of peer
* @publ_list: list of publications
+ * @conn_sks: list of connections (FIXME)
+ * @timer: node's keepalive timer
+ * @keepalive_intv: keepalive interval in milliseconds
* @rcu: rcu struct for tipc_node
* @delete_at: indicates the time for deleting a down node
+ * @peer_net: peer's net namespace
+ * @peer_hash_mix: hash for this peer (FIXME)
* @crypto_rx: RX crypto handler
*/
struct tipc_node {
@@ -267,6 +276,7 @@ char *tipc_node_get_id_str(struct tipc_node *node)
#ifdef CONFIG_TIPC_CRYPTO
/**
* tipc_node_crypto_rx - Retrieve crypto RX handle from node
+ * @__n: target tipc_node
* Note: node ref counter must be held first!
*/
struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n)
@@ -278,6 +288,14 @@ struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos)
{
return container_of(pos, struct tipc_node, list)->crypto_rx;
}
+
+struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr)
+{
+ struct tipc_node *n;
+
+ n = tipc_node_find(net, addr);
+ return (n) ? n->crypto_rx : NULL;
+}
#endif
static void tipc_node_free(struct rcu_head *rp)
@@ -303,7 +321,7 @@ void tipc_node_put(struct tipc_node *node)
kref_put(&node->kref, tipc_node_kref_release);
}
-static void tipc_node_get(struct tipc_node *node)
+void tipc_node_get(struct tipc_node *node)
{
kref_get(&node->kref);
}
@@ -584,6 +602,9 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
static void tipc_node_delete_from_list(struct tipc_node *node)
{
+#ifdef CONFIG_TIPC_CRYPTO
+ tipc_crypto_key_flush(node->crypto_rx);
+#endif
list_del_rcu(&node->list);
hlist_del_rcu(&node->hash);
tipc_node_put(node);
@@ -803,6 +824,9 @@ static void tipc_node_timeout(struct timer_list *t)
/**
* __tipc_node_link_up - handle addition of link
+ * @n: target tipc_node
+ * @bearer_id: id of the bearer
+ * @xmitq: queue for messages to be xmited on
* Node lock must be held by caller
* Link becomes active (alone or shared) or standby, depending on its priority.
*/
@@ -869,6 +893,9 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
/**
* tipc_node_link_up - handle addition of link
+ * @n: target tipc_node
+ * @bearer_id: id of the bearer
+ * @xmitq: queue for messages to be xmited on
*
* Link becomes active (alone or shared) or standby, depending on its priority.
*/
@@ -889,10 +916,11 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id,
*
* This function is only called in a very special situation where link
* failover can be already started on peer node but not on this node.
- * This can happen when e.g.
+ * This can happen when e.g.::
+ *
* 1. Both links <1A-2A>, <1B-2B> down
* 2. Link endpoint 2A up, but 1A still down (e.g. due to network
- * disturbance, wrong session, etc.)
+ * disturbance, wrong session, etc.)
* 3. Link <1B-2B> up
* 4. Link endpoint 2A down (e.g. due to link tolerance timeout)
* 5. Node 2 starts failover onto link <1B-2B>
@@ -929,6 +957,10 @@ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l,
/**
* __tipc_node_link_down - handle loss of link
+ * @n: target tipc_node
+ * @bearer_id: id of the bearer
+ * @xmitq: queue for messages to be xmited on
+ * @maddr: output media address of the bearer
*/
static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
struct sk_buff_head *xmitq,
@@ -1485,7 +1517,7 @@ static void node_lost_contact(struct tipc_node *n,
/* Clean up broadcast state */
tipc_bcast_remove_peer(n->net, n->bc_entry.link);
- __skb_queue_purge(&n->bc_entry.namedq);
+ skb_queue_purge(&n->bc_entry.namedq);
/* Abort any ongoing link failover */
for (i = 0; i < MAX_BEARERS; i++) {
@@ -1514,11 +1546,13 @@ static void node_lost_contact(struct tipc_node *n,
/**
* tipc_node_get_linkname - get the name of a link
*
+ * @net: the applicable net namespace
* @bearer_id: id of the bearer
* @addr: peer node address
* @linkname: link name output buffer
+ * @len: size of @linkname output buffer
*
- * Returns 0 on success
+ * Return: 0 on success
*/
int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
char *linkname, size_t len)
@@ -1627,17 +1661,17 @@ static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list)
return;
default:
return;
- };
+ }
}
/**
- * tipc_node_xmit() is the general link level function for message sending
+ * tipc_node_xmit() - general link level function for message sending
* @net: the applicable net namespace
* @list: chain of buffers containing message
* @dnode: address of destination node
* @selector: a number used for deterministic link selection
* Consumes the buffer chain.
- * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF
+ * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF
*/
int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
u32 dnode, int selector)
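Because the buffer chain is always consumed (per the kernel-doc above), callers never free skbs on failure; they only interpret the return code. Illustrative caller, not from this patch:

	static int example_send(struct net *net, struct sk_buff_head *list,
				u32 dnode)
	{
		int rc = tipc_node_xmit(net, list, dnode, 0);

		/* The chain is gone either way; -ELINKCONG only means the
		 * selected link is congested and a retry may succeed later.
		 */
		return rc;
	}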
@@ -1870,9 +1904,11 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
/**
* tipc_node_check_state - check and if necessary update node state
+ * @n: target tipc_node
* @skb: TIPC packet
* @bearer_id: identity of bearer delivering the packet
- * Returns true if state and msg are ok, otherwise false
+ * @xmitq: queue for messages to be xmited on
+ * Return: true if state and msg are ok, otherwise false
*/
static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
int bearer_id, struct sk_buff_head *xmitq)
@@ -2170,7 +2206,11 @@ void tipc_node_apply_property(struct net *net, struct tipc_bearer *b,
&xmitq);
else if (prop == TIPC_NLA_PROP_MTU)
tipc_link_set_mtu(e->link, b->mtu);
+
+ /* Update MTU for node link entry */
+ e->mtu = tipc_link_mss(e->link);
}
+
tipc_node_write_unlock(n);
tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL);
}
@@ -2184,6 +2224,9 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct nlattr *attrs[TIPC_NLA_NET_MAX + 1];
struct tipc_node *peer, *temp_node;
+ u8 node_id[NODE_ID_LEN];
+ u64 *w0 = (u64 *)&node_id[0];
+ u64 *w1 = (u64 *)&node_id[8];
u32 addr;
int err;
@@ -2197,10 +2240,22 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
- if (!attrs[TIPC_NLA_NET_ADDR])
- return -EINVAL;
+ /* attrs[TIPC_NLA_NET_NODEID] and attrs[TIPC_NLA_NET_ADDR] are
+ * mutually exclusive cases
+ */
+ if (attrs[TIPC_NLA_NET_ADDR]) {
+ addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
+ if (!addr)
+ return -EINVAL;
+ }
- addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
+ if (attrs[TIPC_NLA_NET_NODEID]) {
+ if (!attrs[TIPC_NLA_NET_NODEID_W1])
+ return -EINVAL;
+ *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]);
+ *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]);
+ addr = hash128to32(node_id);
+ }
if (in_own_node(net, addr))
return -ENOTSUPP;
@@ -2868,15 +2923,27 @@ static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id)
return 0;
}
+static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv)
+{
+ struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING];
+
+ if (!attr)
+ return -ENODATA;
+
+ *intv = nla_get_u32(attr);
+ return 0;
+}
+
static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1];
struct net *net = sock_net(skb->sk);
- struct tipc_net *tn = tipc_net(net);
+ struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx;
struct tipc_node *n = NULL;
struct tipc_aead_key *ukey;
- struct tipc_crypto *c;
- u8 *id, *own_id;
+ bool rekeying = true, master_key = false;
+ u8 *id, *own_id, mode;
+ u32 intv = 0;
int rc = 0;
if (!info->attrs[TIPC_NLA_NODE])
@@ -2886,52 +2953,66 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
info->attrs[TIPC_NLA_NODE],
tipc_nl_node_policy, info->extack);
if (rc)
- goto exit;
+ return rc;
own_id = tipc_own_id(net);
if (!own_id) {
- rc = -EPERM;
- goto exit;
+ GENL_SET_ERR_MSG(info, "not found own node identity (set id?)");
+ return -EPERM;
}
+ rc = tipc_nl_retrieve_rekeying(attrs, &intv);
+ if (rc == -ENODATA)
+ rekeying = false;
+
rc = tipc_nl_retrieve_key(attrs, &ukey);
- if (rc)
- goto exit;
+ if (rc == -ENODATA && rekeying)
+ goto rekeying;
+ else if (rc)
+ return rc;
- rc = tipc_aead_key_validate(ukey);
+ rc = tipc_aead_key_validate(ukey, info);
if (rc)
- goto exit;
+ return rc;
rc = tipc_nl_retrieve_nodeid(attrs, &id);
switch (rc) {
case -ENODATA:
- /* Cluster key mode */
- rc = tipc_crypto_key_init(tn->crypto_tx, ukey, CLUSTER_KEY);
+ mode = CLUSTER_KEY;
+ master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]);
break;
case 0:
- /* Per-node key mode */
- if (!memcmp(id, own_id, NODE_ID_LEN)) {
- c = tn->crypto_tx;
- } else {
+ mode = PER_NODE_KEY;
+ if (memcmp(id, own_id, NODE_ID_LEN)) {
n = tipc_node_find_by_id(net, id) ?:
tipc_node_create(net, 0, id, 0xffffu, 0, true);
- if (unlikely(!n)) {
- rc = -ENOMEM;
- break;
- }
+ if (unlikely(!n))
+ return -ENOMEM;
c = n->crypto_rx;
}
-
- rc = tipc_crypto_key_init(c, ukey, PER_NODE_KEY);
- if (n)
- tipc_node_put(n);
break;
default:
- break;
+ return rc;
}
-exit:
- return (rc < 0) ? rc : 0;
+ /* Initiate the TX/RX key */
+ rc = tipc_crypto_key_init(c, ukey, mode, master_key);
+ if (n)
+ tipc_node_put(n);
+
+ if (unlikely(rc < 0)) {
+ GENL_SET_ERR_MSG(info, "unable to initiate or attach new key");
+ return rc;
+ } else if (c == tx) {
+ /* Distribute TX key but not master one */
+ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL))
+ GENL_SET_ERR_MSG(info, "failed to replicate new key");
+rekeying:
+ /* Schedule TX rekeying if needed */
+ tipc_crypto_rekeying_sched(tx, rekeying, intv);
+ }
+
+ return 0;
}
int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
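The rewritten __tipc_nl_node_set_key() now drives both key installation and rekeying scheduling from a single request. Its decision flow, summarized as a comment-only sketch of the code above:

	/*
	 * rekeying interval attribute present?  remember it, rekeying = true
	 * key attribute absent?
	 *     rekeying requested  -> only (re)schedule TX rekeying
	 *     otherwise           -> return the error
	 * key present -> validate it, then:
	 *     no node id given    -> CLUSTER_KEY on own TX, honouring the
	 *                            KEY_MASTER flag
	 *     node id == own id   -> PER_NODE_KEY on own TX
	 *     node id of a peer   -> PER_NODE_KEY on that peer's RX
	 * on success for the TX case: distribute the key (unless it is the
	 * master key) and schedule rekeying with the requested interval.
	 */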
@@ -2958,7 +3039,6 @@ static int __tipc_nl_node_flush_key(struct sk_buff *skb,
tipc_crypto_key_flush(n->crypto_rx);
rcu_read_unlock();
- pr_info("All keys are flushed!\n");
return 0;
}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 9f6f13f1604f..154a5bbb0d29 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -79,12 +79,14 @@ bool tipc_node_get_id(struct net *net, u32 addr, u8 *id);
u32 tipc_node_get_addr(struct tipc_node *node);
char *tipc_node_get_id_str(struct tipc_node *node);
void tipc_node_put(struct tipc_node *node);
+void tipc_node_get(struct tipc_node *node);
struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id,
u16 capabilities, u32 hash_mixes,
bool preliminary);
#ifdef CONFIG_TIPC_CRYPTO
struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n);
struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos);
+struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr);
#endif
u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 11b27ddc75ba..cebcc104dc70 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1,8 +1,9 @@
/*
* net/tipc/socket.c: TIPC socket API
*
- * Copyright (c) 2001-2007, 2012-2017, Ericsson AB
+ * Copyright (c) 2001-2007, 2012-2019, Ericsson AB
* Copyright (c) 2004-2008, 2010-2013, Wind River Systems
+ * Copyright (c) 2020, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -52,10 +53,9 @@
#define NAGLE_START_MAX 1024
#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
#define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */
-#define TIPC_FWD_MSG 1
#define TIPC_MAX_PORT 0xffffffff
#define TIPC_MIN_PORT 1
-#define TIPC_ACK_RATE 4 /* ACK at 1/4 of of rcv window size */
+#define TIPC_ACK_RATE 4 /* ACK at 1/4 of rcv window size */
enum {
TIPC_LISTEN = TCP_LISTEN,
@@ -80,19 +80,32 @@ struct sockaddr_pair {
* @maxnagle: maximum size of msg which can be subject to nagle
* @portid: unique port identity in TIPC socket hash table
* @phdr: preformatted message header used when sending messages
- * #cong_links: list of congested links
+ * @cong_links: list of congested links
* @publications: list of publications for port
* @blocking_link: address of the congested link we are currently sleeping on
* @pub_count: total # of publications port has made during its lifetime
* @conn_timeout: the time we can wait for an unresponded setup request
+ * @probe_unacked: probe has not received ack yet
* @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
* @cong_link_cnt: number of congested links
* @snt_unacked: # messages sent by socket, and not yet acked by peer
+ * @snd_win: send window size
+ * @peer_caps: peer capabilities mask
* @rcv_unacked: # messages read by user, but not yet acked back to peer
+ * @rcv_win: receive window size
* @peer: 'connected' peer for dgram/rdm
* @node: hash table node
* @mc_method: cookie for use between socket and broadcast layer
* @rcu: rcu struct for tipc_sock
+ * @group: TIPC communications group
+ * @oneway: message count in one direction (FIXME)
+ * @nagle_start: current nagle value
+ * @snd_backlog: send backlog count
+ * @msg_acc: messages accepted; used in managing backlog and nagle
+ * @pkt_cnt: TIPC socket packet count
+ * @expect_ack: whether this TIPC socket is expecting an ack
+ * @nodelay: setsockopt() TIPC_NODELAY setting
+ * @group_is_open: TIPC socket group is fully open (FIXME)
*/
struct tipc_sock {
struct sock sk;
@@ -139,9 +152,9 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
bool kern);
static void tipc_sk_timeout(struct timer_list *t);
static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
- struct tipc_name_seq const *seq);
+ struct tipc_service_range const *seq);
static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
- struct tipc_name_seq const *seq);
+ struct tipc_service_range const *seq);
static int tipc_sk_leave(struct tipc_sock *tsk);
static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
static int tipc_sk_insert(struct tipc_sock *tsk);
@@ -261,6 +274,7 @@ static void tsk_set_nagle(struct tipc_sock *tsk)
/**
* tsk_advance_rx_queue - discard first buffer in socket receive queue
+ * @sk: network socket
*
* Caller must hold socket lock
*/
@@ -289,6 +303,8 @@ static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err)
/**
* tsk_rej_rx_queue - reject all buffers in socket receive queue
+ * @sk: network socket
+ * @error: response error code
*
* Caller must hold socket lock
*/
@@ -442,7 +458,7 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout)
* This routine creates additional data structures used by the TIPC socket,
* initializes them, and links them together.
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*/
static int tipc_sk_create(struct net *net, struct socket *sock,
int protocol, int kern)
@@ -607,7 +623,7 @@ static void __tipc_shutdown(struct socket *sock, int error)
* are returned or discarded according to the "destination droppable" setting
* specified for the message by the sender.
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*/
static int tipc_release(struct socket *sock)
{
@@ -645,75 +661,77 @@ static int tipc_release(struct socket *sock)
}
/**
- * tipc_bind - associate or disassocate TIPC name(s) with a socket
+ * __tipc_bind - associate or disassociate TIPC name(s) with a socket
* @sock: socket structure
- * @uaddr: socket address describing name(s) and desired operation
- * @uaddr_len: size of socket address data structure
+ * @skaddr: socket address describing name(s) and desired operation
+ * @alen: size of socket address data structure
*
* Name and name sequence binding is indicated using a positive scope value;
* a negative scope value unbinds the specified name. Specifying no name
* (i.e. a socket address length of 0) unbinds all names from the socket.
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*
* NOTE: This routine doesn't need to take the socket lock since it doesn't
* access any non-constant socket information.
*/
-static int tipc_bind(struct socket *sock, struct sockaddr *uaddr,
- int uaddr_len)
+static int __tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
{
- struct sock *sk = sock->sk;
- struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
- struct tipc_sock *tsk = tipc_sk(sk);
- int res = -EINVAL;
+ struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr;
+ struct tipc_sock *tsk = tipc_sk(sock->sk);
- lock_sock(sk);
- if (unlikely(!uaddr_len)) {
- res = tipc_sk_withdraw(tsk, 0, NULL);
- goto exit;
- }
- if (tsk->group) {
- res = -EACCES;
- goto exit;
- }
- if (uaddr_len < sizeof(struct sockaddr_tipc)) {
- res = -EINVAL;
- goto exit;
- }
- if (addr->family != AF_TIPC) {
- res = -EAFNOSUPPORT;
- goto exit;
- }
+ if (unlikely(!alen))
+ return tipc_sk_withdraw(tsk, 0, NULL);
- if (addr->addrtype == TIPC_ADDR_NAME)
+ if (addr->addrtype == TIPC_SERVICE_ADDR)
addr->addr.nameseq.upper = addr->addr.nameseq.lower;
- else if (addr->addrtype != TIPC_ADDR_NAMESEQ) {
- res = -EAFNOSUPPORT;
- goto exit;
- }
- if ((addr->addr.nameseq.type < TIPC_RESERVED_TYPES) &&
- (addr->addr.nameseq.type != TIPC_TOP_SRV) &&
- (addr->addr.nameseq.type != TIPC_CFG_SRV)) {
- res = -EACCES;
- goto exit;
- }
+ if (tsk->group)
+ return -EACCES;
- res = (addr->scope >= 0) ?
- tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) :
- tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq);
-exit:
- release_sock(sk);
+ if (addr->scope >= 0)
+ return tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq);
+ else
+ return tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq);
+}
+
+int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
+{
+ int res;
+
+ lock_sock(sock->sk);
+ res = __tipc_bind(sock, skaddr, alen);
+ release_sock(sock->sk);
return res;
}
+static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
+{
+ struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr;
+
+ if (alen) {
+ if (alen < sizeof(struct sockaddr_tipc))
+ return -EINVAL;
+ if (addr->family != AF_TIPC)
+ return -EAFNOSUPPORT;
+ if (addr->addrtype > TIPC_SERVICE_ADDR)
+ return -EAFNOSUPPORT;
+ if (addr->addr.nameseq.type < TIPC_RESERVED_TYPES) {
+ pr_warn_once("Can't bind to reserved service type %u\n",
+ addr->addr.nameseq.type);
+ return -EACCES;
+ }
+ }
+ return tipc_sk_bind(sock, skaddr, alen);
+}
+
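The split leaves tipc_bind() as the user-facing entry point that validates the sockaddr (length, family, address type, reserved service types), while tipc_sk_bind() is the lock-taking wrapper that in-kernel users can call with an already-trusted address; the topology server switches to it in the topsrv.c hunk further down. Fragment sketch of that in-kernel use:

	/* As in tipc_topsrv_create_listener() below: */
	rc = tipc_sk_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr));
	if (rc < 0)
		goto err;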
/**
* tipc_getname - get port ID of socket or peer socket
* @sock: socket structure
* @uaddr: area for returned socket address
* @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*
* NOTE: This routine doesn't need to take the socket lock since it only
* accesses socket information that is unchanging (or which changes in
@@ -738,7 +756,7 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
addr->addr.id.node = tipc_own_addr(sock_net(sk));
}
- addr->addrtype = TIPC_ADDR_ID;
+ addr->addrtype = TIPC_SOCKET_ADDR;
addr->family = AF_TIPC;
addr->scope = 0;
addr->addr.name.domain = 0;
@@ -752,7 +770,7 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
* @sock: socket for which to calculate the poll bits
* @wait: ???
*
- * Returns pollmask value
+ * Return: pollmask value
*
* COMMENTARY:
* It appears that the usual socket locking mechanisms are not useful here
@@ -814,9 +832,9 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
* @timeout: timeout to wait for wakeup
*
* Called from function tipc_sendmsg(), which has done all sanity checks
- * Returns the number of bytes sent on success, or errno
+ * Return: the number of bytes sent on success, or errno
*/
-static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
+static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq,
struct msghdr *msg, size_t dlen, long timeout)
{
struct sock *sk = sock->sk;
@@ -874,6 +892,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
/**
* tipc_send_group_msg - send a message to a member in the group
* @net: network namespace
+ * @tsk: tipc socket
* @m: message to send
* @mb: group member
* @dnode: destination node
@@ -929,7 +948,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk,
* @timeout: timeout to wait for wakeup
*
* Called from function tipc_sendmsg(), which has done all sanity checks
- * Returns the number of bytes sent on success, or errno
+ * Return: the number of bytes sent on success, or errno
*/
static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m,
int dlen, long timeout)
@@ -973,7 +992,7 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m,
* @timeout: timeout to wait for wakeup
*
* Called from function tipc_sendmsg(), which has done all sanity checks
- * Returns the number of bytes sent on success, or errno
+ * Return: the number of bytes sent on success, or errno
*/
static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
int dlen, long timeout)
@@ -1058,7 +1077,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
* @timeout: timeout to wait for wakeup
*
* Called from function tipc_sendmsg(), which has done all sanity checks
- * Returns the number of bytes sent on success, or errno
+ * Return: the number of bytes sent on success, or errno
*/
static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
int dlen, long timeout)
@@ -1132,7 +1151,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
* @timeout: timeout to wait for wakeup
*
* Called from function tipc_sendmsg(), which has done all sanity checks
- * Returns the number of bytes sent on success, or errno
+ * Return: the number of bytes sent on success, or errno
*/
static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m,
int dlen, long timeout)
@@ -1169,6 +1188,7 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m,
/**
* tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets
+ * @net: the associated network namespace
* @arrvq: queue with arriving messages, to be cloned after destination lookup
* @inputq: queue with cloned messages, delivered to socket after dest lookup
*
@@ -1308,6 +1328,8 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack)
* tipc_sk_conn_proto_rcv - receive a connection mng protocol message
* @tsk: receiving socket
* @skb: pointer to message buffer.
+ * @inputq: buffer list containing the buffers
+ * @xmitq: output message area
*/
static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
struct sk_buff_head *inputq,
@@ -1375,7 +1397,7 @@ exit:
* and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections.
* (Note: 'SYN+' is prohibited on SOCK_STREAM.)
*
- * Returns the number of bytes sent on success, or errno otherwise
+ * Return: the number of bytes sent on success, or errno otherwise
*/
static int tipc_sendmsg(struct socket *sock,
struct msghdr *m, size_t dsz)
@@ -1401,7 +1423,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
bool syn = !tipc_sk_type_connectionless(sk);
struct tipc_group *grp = tsk->group;
struct tipc_msg *hdr = &tsk->phdr;
- struct tipc_name_seq *seq;
+ struct tipc_service_range *seq;
struct sk_buff_head pkts;
u32 dport = 0, dnode = 0;
u32 type = 0, inst = 0;
@@ -1420,9 +1442,9 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
if (grp) {
if (!dest)
return tipc_send_group_bcast(sock, m, dlen, timeout);
- if (dest->addrtype == TIPC_ADDR_NAME)
+ if (dest->addrtype == TIPC_SERVICE_ADDR)
return tipc_send_group_anycast(sock, m, dlen, timeout);
- if (dest->addrtype == TIPC_ADDR_ID)
+ if (dest->addrtype == TIPC_SOCKET_ADDR)
return tipc_send_group_unicast(sock, m, dlen, timeout);
if (dest->addrtype == TIPC_ADDR_MCAST)
return tipc_send_group_mcast(sock, m, dlen, timeout);
@@ -1442,7 +1464,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
return -EISCONN;
if (tsk->published)
return -EOPNOTSUPP;
- if (dest->addrtype == TIPC_ADDR_NAME) {
+ if (dest->addrtype == TIPC_SERVICE_ADDR) {
tsk->conn_type = dest->addr.name.name.type;
tsk->conn_instance = dest->addr.name.name.instance;
}
@@ -1453,14 +1475,14 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
if (dest->addrtype == TIPC_ADDR_MCAST)
return tipc_sendmcast(sock, seq, m, dlen, timeout);
- if (dest->addrtype == TIPC_ADDR_NAME) {
+ if (dest->addrtype == TIPC_SERVICE_ADDR) {
type = dest->addr.name.name.type;
inst = dest->addr.name.name.instance;
dnode = dest->addr.name.domain;
dport = tipc_nametbl_translate(net, type, inst, &dnode);
if (unlikely(!dport && !dnode))
return -EHOSTUNREACH;
- } else if (dest->addrtype == TIPC_ADDR_ID) {
+ } else if (dest->addrtype == TIPC_SOCKET_ADDR) {
dnode = dest->addr.id.node;
} else {
return -EINVAL;
@@ -1472,7 +1494,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
if (unlikely(rc))
return rc;
- if (dest->addrtype == TIPC_ADDR_NAME) {
+ if (dest->addrtype == TIPC_SERVICE_ADDR) {
msg_set_type(hdr, TIPC_NAMED_MSG);
msg_set_hdr_sz(hdr, NAMED_H_SIZE);
msg_set_nametype(hdr, type);
@@ -1480,7 +1502,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
msg_set_lookup_scope(hdr, tipc_node2scope(dnode));
msg_set_destnode(hdr, dnode);
msg_set_destport(hdr, dport);
- } else { /* TIPC_ADDR_ID */
+ } else { /* TIPC_SOCKET_ADDR */
msg_set_type(hdr, TIPC_DIRECT_MSG);
msg_set_lookup_scope(hdr, 0);
msg_set_destnode(hdr, dnode);
@@ -1520,7 +1542,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
*
* Used for SOCK_STREAM data.
*
- * Returns the number of bytes sent on success (or partial success),
+ * Return: the number of bytes sent on success (or partial success),
* or errno if no data sent
*/
static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz)
@@ -1628,7 +1650,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
*
* Used for SOCK_SEQPACKET messages.
*
- * Returns the number of bytes sent on success, or errno otherwise
+ * Return: the number of bytes sent on success, or errno otherwise
*/
static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz)
{
@@ -1685,7 +1707,7 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb)
return;
srcaddr->sock.family = AF_TIPC;
- srcaddr->sock.addrtype = TIPC_ADDR_ID;
+ srcaddr->sock.addrtype = TIPC_SOCKET_ADDR;
srcaddr->sock.scope = 0;
srcaddr->sock.addr.id.ref = msg_origport(hdr);
srcaddr->sock.addr.id.node = msg_orignode(hdr);
@@ -1697,7 +1719,7 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb)
/* Group message users may also want to know sending member's id */
srcaddr->member.family = AF_TIPC;
- srcaddr->member.addrtype = TIPC_ADDR_NAME;
+ srcaddr->member.addrtype = TIPC_SERVICE_ADDR;
srcaddr->member.scope = 0;
srcaddr->member.addr.name.name.type = msg_nametype(hdr);
srcaddr->member.addr.name.name.instance = TIPC_SKB_CB(skb)->orig_member;
@@ -1713,7 +1735,7 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb)
*
* Note: Ancillary data is not captured if not requested by receiver.
*
- * Returns 0 if successful, otherwise errno
+ * Return: 0 if successful, otherwise errno
*/
static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb,
struct tipc_sock *tsk)
@@ -1863,6 +1885,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
/**
* tipc_recvmsg - receive packet-oriented message
+ * @sock: network socket
* @m: descriptor for message info
* @buflen: length of user buffer area
* @flags: receive flags
@@ -1870,7 +1893,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
* Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages.
* If the complete message doesn't fit in user area, truncate it.
*
- * Returns size of returned message data, errno otherwise
+ * Return: size of returned message data, errno otherwise
*/
static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
size_t buflen, int flags)
@@ -1971,6 +1994,7 @@ exit:
/**
* tipc_recvstream - receive stream-oriented data
+ * @sock: network socket
* @m: descriptor for message info
* @buflen: total size of user buffer area
* @flags: receive flags
@@ -1978,7 +2002,7 @@ exit:
* Used for SOCK_STREAM messages only. If not enough data is available
* will optionally wait for more; never truncates data.
*
- * Returns size of returned message data, errno otherwise
+ * Return: size of returned message data, errno otherwise
*/
static int tipc_recvstream(struct socket *sock, struct msghdr *m,
size_t buflen, int flags)
@@ -2156,7 +2180,7 @@ static void tipc_sk_proto_rcv(struct sock *sk,
* @tsk: TIPC socket
* @skb: pointer to message buffer.
* @xmitq: for Nagle ACK if any
- * Returns true if message should be added to receive queue, false otherwise
+ * Return: true if message should be added to receive queue, false otherwise
*/
static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb,
struct sk_buff_head *xmitq)
@@ -2270,7 +2294,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb,
* TIPC_HIGH_IMPORTANCE (8 MB)
* TIPC_CRITICAL_IMPORTANCE (16 MB)
*
- * Returns overload limit according to corresponding message importance
+ * Return: overload limit according to corresponding message importance
*/
static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
{
@@ -2293,12 +2317,12 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
* tipc_sk_filter_rcv - validate incoming message
* @sk: socket
* @skb: pointer to message.
+ * @xmitq: output message area (FIXME)
*
* Enqueues message on receive queue if acceptable; optionally handles
* disconnect indication for a connected socket.
*
* Called with socket lock already taken
- *
*/
static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,
struct sk_buff_head *xmitq)
@@ -2388,6 +2412,7 @@ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
* @inputq: list of incoming buffers with potentially different destinations
* @sk: socket where the buffers should be enqueued
* @dport: port number for the socket
+ * @xmitq: output queue
*
* Caller must hold socket lock
*/
@@ -2440,6 +2465,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
/**
* tipc_sk_rcv - handle a chain of incoming buffers
+ * @net: the associated network namespace
* @inputq: buffer list containing the buffers
* Consumes all buffers in list until inputq is empty
* Note: may be called in multiple threads referring to the same queue
@@ -2532,7 +2558,7 @@ static bool tipc_sockaddr_is_sane(struct sockaddr_tipc *addr)
* @destlen: size of socket address data structure
* @flags: file-related flags associated with socket
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*/
static int tipc_connect(struct socket *sock, struct sockaddr *dest,
int destlen, int flags)
@@ -2625,7 +2651,7 @@ exit:
* @sock: socket structure
* @len: (unused)
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*/
static int tipc_listen(struct socket *sock, int len)
{
@@ -2677,8 +2703,9 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo)
* @sock: listening socket
* @new_sock: new socket that is to be connected
* @flags: file-related flags associated with socket
+ * @kern: caused by kernel or by userspace?
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*/
static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
bool kern)
@@ -2757,7 +2784,7 @@ exit:
*
* Terminates connection (if necessary), then purges socket's receive queue.
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*/
static int tipc_shutdown(struct socket *sock, int how)
{
@@ -2865,7 +2892,7 @@ static void tipc_sk_timeout(struct timer_list *t)
}
static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
- struct tipc_name_seq const *seq)
+ struct tipc_service_range const *seq)
{
struct sock *sk = &tsk->sk;
struct net *net = sock_net(sk);
@@ -2893,7 +2920,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
}
static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
- struct tipc_name_seq const *seq)
+ struct tipc_service_range const *seq)
{
struct net *net = sock_net(&tsk->sk);
struct publication *publ;
@@ -3040,7 +3067,7 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
struct net *net = sock_net(&tsk->sk);
struct tipc_group *grp = tsk->group;
struct tipc_msg *hdr = &tsk->phdr;
- struct tipc_name_seq seq;
+ struct tipc_service_range seq;
int rc;
if (mreq->type < TIPC_RESERVED_TYPES)
@@ -3077,7 +3104,7 @@ static int tipc_sk_leave(struct tipc_sock *tsk)
{
struct net *net = sock_net(&tsk->sk);
struct tipc_group *grp = tsk->group;
- struct tipc_name_seq seq;
+ struct tipc_service_range seq;
int scope;
if (!grp)
@@ -3100,7 +3127,7 @@ static int tipc_sk_leave(struct tipc_sock *tsk)
* For stream sockets only, accepts and ignores all IPPROTO_TCP options
* (to ease compatibility).
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*/
static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
sockptr_t ov, unsigned int ol)
@@ -3194,14 +3221,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
* For stream sockets only, returns 0 length result for all IPPROTO_TCP options
* (to ease compatibility).
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*/
static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
char __user *ov, int __user *ol)
{
struct sock *sk = sock->sk;
struct tipc_sock *tsk = tipc_sk(sk);
- struct tipc_name_seq seq;
+ struct tipc_service_range seq;
int len, scope;
u32 value;
int res;
@@ -3302,12 +3329,12 @@ static int tipc_socketpair(struct socket *sock1, struct socket *sock2)
u32 onode = tipc_own_addr(sock_net(sock1->sk));
tsk1->peer.family = AF_TIPC;
- tsk1->peer.addrtype = TIPC_ADDR_ID;
+ tsk1->peer.addrtype = TIPC_SOCKET_ADDR;
tsk1->peer.scope = TIPC_NODE_SCOPE;
tsk1->peer.addr.id.ref = tsk2->portid;
tsk1->peer.addr.id.node = onode;
tsk2->peer.family = AF_TIPC;
- tsk2->peer.addrtype = TIPC_ADDR_ID;
+ tsk2->peer.addrtype = TIPC_SOCKET_ADDR;
tsk2->peer.scope = TIPC_NODE_SCOPE;
tsk2->peer.addr.id.ref = tsk1->portid;
tsk2->peer.addr.id.node = onode;
@@ -3398,7 +3425,7 @@ static struct proto tipc_proto = {
/**
* tipc_socket_init - initialize TIPC socket interface
*
- * Returns 0 on success, errno otherwise
+ * Return: 0 on success, errno otherwise
*/
int tipc_socket_init(void)
{
@@ -3797,10 +3824,11 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb)
/**
* tipc_sk_filtering - check if a socket should be traced
* @sk: the socket to be examined
- * @sysctl_tipc_sk_filter[]: the socket tuple for filtering,
- * (portid, sock type, name type, name lower, name upper)
*
- * Returns true if the socket meets the socket tuple data
+ * @sysctl_tipc_sk_filter is used as the socket tuple for filtering:
+ * (portid, sock type, name type, name lower, name upper)
+ *
+ * Return: true if the socket meets the socket tuple data
* (value 0 = 'any') or when there is no tuple set (all = 0),
* otherwise false
*/
@@ -3865,7 +3893,7 @@ u32 tipc_sock_get_portid(struct sock *sk)
* @sk: tipc sk to be checked
* @skb: tipc msg to be checked
*
- * Returns true if the socket rx queue allocation is > 90%, otherwise false
+ * Return: true if the socket rx queue allocation is > 90%, otherwise false
*/
bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb)
@@ -3883,7 +3911,7 @@ bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb)
* @sk: tipc sk to be checked
* @skb: tipc msg to be checked
*
- * Returns true if the socket rx queue allocation is > 90%, otherwise false
+ * Return: true if the socket rx queue allocation is > 90%, otherwise false
*/
bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb)
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
index b11575afc66f..02cdf166807d 100644
--- a/net/tipc/socket.h
+++ b/net/tipc/socket.h
@@ -74,7 +74,7 @@ int tipc_dump_done(struct netlink_callback *cb);
u32 tipc_sock_get_portid(struct sock *sk);
bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb);
bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb);
-
+int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen);
int tsk_set_importance(struct sock *sk, int imp);
#endif
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index f340e53da625..f6ad0005218c 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -3,6 +3,7 @@
*
* Copyright (c) 2000-2017, Ericsson AB
* Copyright (c) 2005-2007, 2010-2013, Wind River Systems
+ * Copyright (c) 2020, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -55,12 +56,14 @@ static void tipc_sub_send_event(struct tipc_subscription *sub,
}
/**
- * tipc_sub_check_overlap - test for subscription overlap with the
- * given values
+ * tipc_sub_check_overlap - test for subscription overlap with the given values
+ * @seq: tipc_name_seq to check
+ * @found_lower: lower value to test
+ * @found_upper: upper value to test
*
- * Returns 1 if there is overlap, otherwise 0.
+ * Return: 1 if there is overlap, otherwise 0.
*/
-int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
+int tipc_sub_check_overlap(struct tipc_service_range *seq, u32 found_lower,
u32 found_upper)
{
if (found_lower < seq->lower)
@@ -79,7 +82,7 @@ void tipc_sub_report_overlap(struct tipc_subscription *sub,
{
struct tipc_subscr *s = &sub->evt.s;
u32 filter = tipc_sub_read(s, filter);
- struct tipc_name_seq seq;
+ struct tipc_service_range seq;
seq.type = tipc_sub_read(s, seq.type);
seq.lower = tipc_sub_read(s, seq.lower);
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 6ebbec1bedd1..3ded27391d54 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -3,6 +3,7 @@
*
* Copyright (c) 2003-2017, Ericsson AB
* Copyright (c) 2005-2007, 2012-2013, Wind River Systems
+ * Copyright (c) 2020, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -47,12 +48,15 @@ struct tipc_conn;
/**
* struct tipc_subscription - TIPC network topology subscription object
- * @subscriber: pointer to its subscriber
- * @seq: name sequence associated with subscription
+ * @kref: reference count for this subscription
+ * @net: network namespace associated with subscription
* @timer: timer governing subscription duration (optional)
- * @nameseq_list: adjacent subscriptions in name sequence's subscription list
+ * @service_list: adjacent subscriptions in name sequence's subscription list
* @sub_list: adjacent subscriptions in subscriber's subscription list
* @evt: template for events generated by subscription
+ * @conid: connection identifier of topology server
+ * @inactive: true if this subscription is inactive
+ * @lock: serialize up/down and timer events
*/
struct tipc_subscription {
struct kref kref;
@@ -63,7 +67,7 @@ struct tipc_subscription {
struct tipc_event evt;
int conid;
bool inactive;
- spinlock_t lock; /* serialize up/down and timer events */
+ spinlock_t lock;
};
struct tipc_subscription *tipc_sub_subscribe(struct net *net,
@@ -71,8 +75,8 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net,
int conid);
void tipc_sub_unsubscribe(struct tipc_subscription *sub);
-int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
- u32 found_upper);
+int tipc_sub_check_overlap(struct tipc_service_range *seq,
+ u32 found_lower, u32 found_upper);
void tipc_sub_report_overlap(struct tipc_subscription *sub,
u32 found_lower, u32 found_upper,
u32 event, u32 port, u32 node,
diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c
index 97a6264a2993..9fb65c988f7f 100644
--- a/net/tipc/sysctl.c
+++ b/net/tipc/sysctl.c
@@ -74,6 +74,15 @@ static struct ctl_table tipc_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
+ {
+ .procname = "key_exchange_enabled",
+ .data = &sysctl_tipc_key_exchange_enabled,
+ .maxlen = sizeof(sysctl_tipc_key_exchange_enabled),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
#endif
{
.procname = "bc_retruni",
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index 1489cfb941d8..5522865deae9 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -48,7 +48,6 @@
#define MAX_SEND_MSG_COUNT 25
#define MAX_RECV_MSG_COUNT 25
#define CF_CONNECTED 1
-#define CF_SERVER 2
#define TIPC_SERVER_NAME_LEN 32
@@ -520,13 +519,13 @@ static int tipc_topsrv_create_listener(struct tipc_topsrv *srv)
goto err;
saddr.family = AF_TIPC;
- saddr.addrtype = TIPC_ADDR_NAMESEQ;
- saddr.addr.nameseq.type = TIPC_TOP_SRV;
+ saddr.addrtype = TIPC_SERVICE_RANGE;
+ saddr.addr.nameseq.type = TIPC_TOP_SRV;
saddr.addr.nameseq.lower = TIPC_TOP_SRV;
saddr.addr.nameseq.upper = TIPC_TOP_SRV;
saddr.scope = TIPC_NODE_SCOPE;
- rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr));
+ rc = tipc_sk_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr));
if (rc < 0)
goto err;
rc = kernel_listen(lsock, 0);
@@ -665,12 +664,18 @@ static int tipc_topsrv_start(struct net *net)
ret = tipc_topsrv_work_start(srv);
if (ret < 0)
- return ret;
+ goto err_start;
ret = tipc_topsrv_create_listener(srv);
if (ret < 0)
- tipc_topsrv_work_stop(srv);
+ goto err_create;
+ return 0;
+
+err_create:
+ tipc_topsrv_work_stop(srv);
+err_start:
+ kfree(srv);
return ret;
}
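The reworked error path makes tipc_topsrv_start() unwind symmetrically: the srv allocation made earlier in the function (outside this hunk) is now freed on either failure instead of leaking. Summarized as a comment-only sketch:

	/*
	 * tipc_topsrv_work_start() fails      -> kfree(srv)
	 * tipc_topsrv_create_listener() fails -> tipc_topsrv_work_stop(srv),
	 *                                        then kfree(srv)
	 */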
diff --git a/net/tipc/trace.c b/net/tipc/trace.c
index 265f6a26aa3d..7d2931521e0e 100644
--- a/net/tipc/trace.c
+++ b/net/tipc/trace.c
@@ -36,7 +36,7 @@
#define CREATE_TRACE_POINTS
#include "trace.h"
-/**
+/*
* socket tuples for filtering in socket traces:
* (portid, sock type, name type, name lower, name upper)
*/
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 911d13cd2e67..21e75e28e86a 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -52,6 +52,7 @@
#include "bearer.h"
#include "netlink.h"
#include "msg.h"
+#include "udp_media.h"
/* IANA assigned UDP port */
#define UDP_PORT_DEFAULT 6118
@@ -63,6 +64,11 @@
*
* This is the bearer level originating address used in neighbor discovery
* messages, and all fields should be in network byte order
+ *
+ * @proto: Ethernet protocol in use
+ * @port: port being used
+ * @ipv4: IPv4 address of neighbor
+ * @ipv6: IPv6 address of neighbor
*/
struct udp_media_addr {
__be16 proto;
@@ -87,6 +93,7 @@ struct udp_replicast {
* @ubsock: bearer associated socket
* @ifindex: local address scope
* @work: used to schedule deferred work on a bearer
+ * @rcast: associated udp_replicast container
*/
struct udp_bearer {
struct tipc_bearer __rcu *bearer;
@@ -771,7 +778,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
if (err)
goto free;
- /**
+ /*
* The bcast media address port is used for all peers and the ip
* is used if it's a multicast address.
*/
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index b74e2741f74f..f7fb7d2c1de1 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -327,7 +327,7 @@ static int tls_device_record_close(struct sock *sk,
/* fill prepend */
tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]),
record->len - prot->overhead_size,
- record_type, prot->version);
+ record_type);
return ret;
}
@@ -418,14 +418,14 @@ static int tls_push_data(struct sock *sk,
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
- int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
struct tls_record_info *record = ctx->open_record;
int tls_push_record_flags;
struct page_frag *pfrag;
size_t orig_size = size;
u32 max_open_record_len;
- int copy, rc = 0;
+ bool more = false;
bool done = false;
+ int copy, rc = 0;
long timeo;
if (flags &
@@ -492,9 +492,8 @@ handle_error:
if (!size) {
last_record:
tls_push_record_flags = flags;
- if (more) {
- tls_ctx->pending_open_record_frags =
- !!record->num_frags;
+ if (flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE)) {
+ more = true;
break;
}
@@ -526,6 +525,8 @@ last_record:
}
} while (!done);
+ tls_ctx->pending_open_record_frags = more;
+
if (orig_size - size > 0)
rc = orig_size - size;
@@ -693,36 +694,51 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx,
static bool
tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async,
- s64 resync_req, u32 *seq)
+ s64 resync_req, u32 *seq, u16 *rcd_delta)
{
u32 is_async = resync_req & RESYNC_REQ_ASYNC;
u32 req_seq = resync_req >> 32;
u32 req_end = req_seq + ((resync_req >> 16) & 0xffff);
+ u16 i;
+
+ *rcd_delta = 0;
if (is_async) {
+ /* shouldn't get to wraparound:
+ * too long in async stage, something bad happened
+ */
+ if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX))
+ return false;
+
/* asynchronous stage: log all headers seq such that
* req_seq <= seq <= end_seq, and wait for real resync request
*/
- if (between(*seq, req_seq, req_end) &&
+ if (before(*seq, req_seq))
+ return false;
+ if (!after(*seq, req_end) &&
resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX)
resync_async->log[resync_async->loglen++] = *seq;
+ resync_async->rcd_delta++;
+
return false;
}
/* synchronous stage: check against the logged entries and
* proceed to check the next entries if no match was found
*/
- while (resync_async->loglen) {
- if (req_seq == resync_async->log[resync_async->loglen - 1] &&
- atomic64_try_cmpxchg(&resync_async->req,
- &resync_req, 0)) {
- resync_async->loglen = 0;
+ for (i = 0; i < resync_async->loglen; i++)
+ if (req_seq == resync_async->log[i] &&
+ atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) {
+ *rcd_delta = resync_async->rcd_delta - i;
*seq = req_seq;
+ resync_async->loglen = 0;
+ resync_async->rcd_delta = 0;
return true;
}
- resync_async->loglen--;
- }
+
+ resync_async->loglen = 0;
+ resync_async->rcd_delta = 0;
if (req_seq == *seq &&
atomic64_try_cmpxchg(&resync_async->req,
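
The rewritten helper logs the TCP sequence of every record header seen while the device is in the asynchronous stage and, once the real resync request arrives, scans that log from the oldest entry so it can also report how many records the device has fallen behind (rcd_delta). A standalone sketch of the same log-and-match idea (names and structure are illustrative, not the kernel's):

    /* Illustrative sketch of the log-and-match resync scheme above. */
    #include <stdbool.h>
    #include <stdint.h>

    #define LOGMAX 13

    struct resync_log {
        uint32_t seq[LOGMAX];  /* TCP seq of each record header seen while async */
        uint16_t len;          /* entries logged */
        uint16_t rcd_delta;    /* records seen since the device lost sync */
    };

    /* asynchronous stage: remember where record headers started */
    static void log_header(struct resync_log *l, uint32_t seq)
    {
        if (l->len < LOGMAX)
            l->seq[l->len++] = seq;
        l->rcd_delta++;
    }

    /* synchronous stage: find the header the device asked about; *delta
     * tells the caller how many record sequence numbers to roll back.
     */
    static bool match_request(struct resync_log *l, uint32_t req_seq,
                              uint32_t *resync_seq, uint16_t *delta)
    {
        for (uint16_t i = 0; i < l->len; i++) {
            if (l->seq[i] == req_seq) {
                *delta = l->rcd_delta - i;
                *resync_seq = req_seq;
                l->len = 0;
                l->rcd_delta = 0;
                return true;
            }
        }
        l->len = 0;
        l->rcd_delta = 0;
        return false;
    }
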
@@ -740,6 +756,7 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
u32 sock_data, is_req_pending;
struct tls_prot_info *prot;
s64 resync_req;
+ u16 rcd_delta;
u32 req_seq;
if (tls_ctx->rx_conf != TLS_HW)
@@ -785,8 +802,9 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
return;
if (!tls_device_rx_resync_async(rx_ctx->resync_async,
- resync_req, &seq))
+ resync_req, &seq, &rcd_delta))
return;
+ tls_bigint_subtract(rcd_sn, rcd_delta);
break;
}
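
tls_bigint_subtract() then rolls the big-endian record sequence number back by that delta before resync continues. A hedged userspace sketch of the same byte-array arithmetic (assumed equivalent in spirit, not the kernel implementation):

    /* Roll an 8-byte, big-endian record sequence number back by `delta`. */
    #include <endian.h>
    #include <stdint.h>
    #include <string.h>

    static void rec_seq_subtract(unsigned char seq[8], uint16_t delta)
    {
        uint64_t v;

        memcpy(&v, seq, sizeof(v));            /* avoid alignment issues */
        v = htobe64(be64toh(v) - delta);       /* decode, subtract, re-encode */
        memcpy(seq, &v, sizeof(v));
    }
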
@@ -980,7 +998,7 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
{
- u16 nonce_size, tag_size, iv_size, rec_seq_size;
+ u16 nonce_size, tag_size, iv_size, rec_seq_size, salt_size;
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_record_info *start_marker_record;
@@ -1021,6 +1039,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv;
rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE;
+ salt_size = TLS_CIPHER_AES_GCM_128_SALT_SIZE;
rec_seq =
((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq;
break;
@@ -1041,6 +1060,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
prot->tag_size = tag_size;
prot->overhead_size = prot->prepend_size + prot->tag_size;
prot->iv_size = iv_size;
+ prot->salt_size = salt_size;
ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
GFP_KERNEL);
if (!ctx->tx.iv) {
@@ -1244,6 +1264,8 @@ void tls_device_offload_cleanup_rx(struct sock *sk)
if (tls_ctx->tx_conf != TLS_HW) {
dev_put(netdev);
tls_ctx->netdev = NULL;
+ } else {
+ set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags);
}
out:
up_read(&device_offload_lock);
@@ -1273,7 +1295,8 @@ static int tls_device_down(struct net_device *netdev)
if (ctx->tx_conf == TLS_HW)
netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
TLS_OFFLOAD_CTX_DIR_TX);
- if (ctx->rx_conf == TLS_HW)
+ if (ctx->rx_conf == TLS_HW &&
+ !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags))
netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
TLS_OFFLOAD_CTX_DIR_RX);
WRITE_ONCE(ctx->netdev, NULL);
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 28895333701e..d946817ed065 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -49,7 +49,8 @@ static int tls_enc_record(struct aead_request *aead_req,
struct crypto_aead *aead, char *aad,
char *iv, __be64 rcd_sn,
struct scatter_walk *in,
- struct scatter_walk *out, int *in_len)
+ struct scatter_walk *out, int *in_len,
+ struct tls_prot_info *prot)
{
unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE];
struct scatterlist sg_in[3];
@@ -73,8 +74,7 @@ static int tls_enc_record(struct aead_request *aead_req,
len -= TLS_CIPHER_AES_GCM_128_IV_SIZE;
tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE,
- (char *)&rcd_sn, sizeof(rcd_sn), buf[0],
- TLS_1_2_VERSION);
+ (char *)&rcd_sn, buf[0], prot);
memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE,
TLS_CIPHER_AES_GCM_128_IV_SIZE);
@@ -140,7 +140,7 @@ static struct aead_request *tls_alloc_aead_request(struct crypto_aead *aead,
static int tls_enc_records(struct aead_request *aead_req,
struct crypto_aead *aead, struct scatterlist *sg_in,
struct scatterlist *sg_out, char *aad, char *iv,
- u64 rcd_sn, int len)
+ u64 rcd_sn, int len, struct tls_prot_info *prot)
{
struct scatter_walk out, in;
int rc;
@@ -150,7 +150,7 @@ static int tls_enc_records(struct aead_request *aead_req,
do {
rc = tls_enc_record(aead_req, aead, aad, iv,
- cpu_to_be64(rcd_sn), &in, &out, &len);
+ cpu_to_be64(rcd_sn), &in, &out, &len, prot);
rcd_sn++;
} while (rc == 0 && len);
@@ -348,7 +348,8 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
payload_len, sync_size, dummy_buf);
if (tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, aad, iv,
- rcd_sn, sync_size + payload_len) < 0)
+ rcd_sn, sync_size + payload_len,
+ &tls_ctx->prot_info) < 0)
goto free_nskb;
complete_skb(nskb, skb, tcp_payload_offset);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index bbc52b088d29..47b7c5334c34 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -330,12 +330,13 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
tls_ctx_free(sk, ctx);
}
-static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
- int __user *optlen)
+static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval,
+ int __user *optlen, int tx)
{
int rc = 0;
struct tls_context *ctx = tls_get_ctx(sk);
struct tls_crypto_info *crypto_info;
+ struct cipher_context *cctx;
int len;
if (get_user(len, optlen))
@@ -352,7 +353,13 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
}
/* get user crypto info */
- crypto_info = &ctx->crypto_send.info;
+ if (tx) {
+ crypto_info = &ctx->crypto_send.info;
+ cctx = &ctx->tx;
+ } else {
+ crypto_info = &ctx->crypto_recv.info;
+ cctx = &ctx->rx;
+ }
if (!TLS_CRYPTO_INFO_READY(crypto_info)) {
rc = -EBUSY;
@@ -379,9 +386,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
}
lock_sock(sk);
memcpy(crypto_info_aes_gcm_128->iv,
- ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+ cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
TLS_CIPHER_AES_GCM_128_IV_SIZE);
- memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq,
+ memcpy(crypto_info_aes_gcm_128->rec_seq, cctx->rec_seq,
TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
release_sock(sk);
if (copy_to_user(optval,
@@ -403,9 +410,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
}
lock_sock(sk);
memcpy(crypto_info_aes_gcm_256->iv,
- ctx->tx.iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE,
+ cctx->iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE,
TLS_CIPHER_AES_GCM_256_IV_SIZE);
- memcpy(crypto_info_aes_gcm_256->rec_seq, ctx->tx.rec_seq,
+ memcpy(crypto_info_aes_gcm_256->rec_seq, cctx->rec_seq,
TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE);
release_sock(sk);
if (copy_to_user(optval,
@@ -429,7 +436,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
switch (optname) {
case TLS_TX:
- rc = do_tls_getsockopt_tx(sk, optval, optlen);
+ case TLS_RX:
+ rc = do_tls_getsockopt_conf(sk, optval, optlen,
+ optname == TLS_TX);
break;
default:
rc = -ENOPROTOOPT;
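
With TLS_RX now routed through do_tls_getsockopt_conf(), user space can read back the receive-side crypto parameters exactly as it already could for TLS_TX. A minimal sketch, assuming AES-128-GCM was configured on the RX path beforehand:

    /* Reads the RX crypto state back from the kernel; only meaningful
     * after setsockopt(SOL_TLS, TLS_RX, ...) configured AES-128-GCM.
     */
    #include <linux/tls.h>
    #include <string.h>
    #include <sys/socket.h>

    #ifndef SOL_TLS
    #define SOL_TLS 282
    #endif

    static int dump_rx_state(int fd, struct tls12_crypto_info_aes_gcm_128 *ci)
    {
        socklen_t len = sizeof(*ci);

        memset(ci, 0, sizeof(*ci));
        return getsockopt(fd, SOL_TLS, TLS_RX, ci, &len);
    }
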
@@ -512,6 +521,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
case TLS_CIPHER_AES_CCM_128:
optsize = sizeof(struct tls12_crypto_info_aes_ccm_128);
break;
+ case TLS_CIPHER_CHACHA20_POLY1305:
+ optsize = sizeof(struct tls12_crypto_info_chacha20_poly1305);
+ break;
default:
rc = -EINVAL;
goto err_crypto_info;
@@ -860,7 +872,7 @@ static int __init tls_register(void)
tls_sw_proto_ops = inet_stream_ops;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
- tls_sw_proto_ops.sendpage_locked = tls_sw_sendpage_locked,
+ tls_sw_proto_ops.sendpage_locked = tls_sw_sendpage_locked;
tls_device_init();
tcp_register_ulp(&tcp_tls_ulp_ops);
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 3a5dd1e07233..feeceb0e4cb4 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -37,9 +37,12 @@ static int tls_statistics_seq_show(struct seq_file *seq, void *v)
int __net_init tls_proc_init(struct net *net)
{
+#ifdef CONFIG_PROC_FS
if (!proc_create_net_single("tls_stat", 0444, net->proc_net,
tls_statistics_seq_show, NULL))
return -ENOMEM;
+#endif /* CONFIG_PROC_FS */
+
return 0;
}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 95ab5545a931..01d933ae5f16 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -505,7 +505,7 @@ static int tls_do_encryption(struct sock *sk,
memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv,
prot->iv_size + prot->salt_size);
- xor_iv_with_seq(prot->version, rec->iv_data, tls_ctx->tx.rec_seq);
+ xor_iv_with_seq(prot, rec->iv_data, tls_ctx->tx.rec_seq);
sge->offset += prot->prepend_size;
sge->length -= prot->prepend_size;
@@ -748,14 +748,13 @@ static int tls_push_record(struct sock *sk, int flags,
sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]);
tls_make_aad(rec->aad_space, msg_pl->sg.size + prot->tail_size,
- tls_ctx->tx.rec_seq, prot->rec_seq_size,
- record_type, prot->version);
+ tls_ctx->tx.rec_seq, record_type, prot);
tls_fill_prepend(tls_ctx,
page_address(sg_page(&msg_en->sg.data[i])) +
msg_en->sg.data[i].offset,
msg_pl->sg.size + prot->tail_size,
- record_type, prot->version);
+ record_type);
tls_ctx->pending_open_record_frags = false;
@@ -1295,6 +1294,12 @@ static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock,
return NULL;
}
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
+ __strp_unpause(&ctx->strp);
+ if (ctx->recv_pkt)
+ return ctx->recv_pkt;
+ }
+
if (sk->sk_shutdown & RCV_SHUTDOWN)
return NULL;
@@ -1465,19 +1470,19 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
kfree(mem);
return err;
}
- if (prot->version == TLS_1_3_VERSION)
+ if (prot->version == TLS_1_3_VERSION ||
+ prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305)
memcpy(iv + iv_offset, tls_ctx->rx.iv,
crypto_aead_ivsize(ctx->aead_recv));
else
memcpy(iv + iv_offset, tls_ctx->rx.iv, prot->salt_size);
- xor_iv_with_seq(prot->version, iv, tls_ctx->rx.rec_seq);
+ xor_iv_with_seq(prot, iv, tls_ctx->rx.rec_seq);
/* Prepare AAD */
tls_make_aad(aad, rxm->full_len - prot->overhead_size +
prot->tail_size,
- tls_ctx->rx.rec_seq, prot->rec_seq_size,
- ctx->control, prot->version);
+ tls_ctx->rx.rec_seq, ctx->control, prot);
/* Prepare sgin */
sg_init_table(sgin, n_sgin);
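
Passing the whole tls_prot_info lets the nonce be built the TLS 1.3 way (full static IV XORed with the record sequence number) for both TLS 1.3 and ChaCha20-Poly1305. A standalone sketch of that construction (as described in RFC 8446, section 5.3):

    /* Per-record nonce: XOR the 8-byte record sequence number into the
     * tail of a 12-byte IV.
     */
    #include <stddef.h>
    #include <stdint.h>

    static void xor_seq_into_iv(uint8_t iv[12], const uint8_t rec_seq[8])
    {
        /* the sequence number is right-aligned within the IV */
        for (size_t i = 0; i < 8; i++)
            iv[4 + i] ^= rec_seq[i];
    }
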
@@ -1913,7 +1918,7 @@ pick_next_record:
* another message type
*/
msg->msg_flags |= MSG_EOR;
- if (ctx->control != TLS_RECORD_TYPE_DATA)
+ if (control != TLS_RECORD_TYPE_DATA)
goto recv_end;
} else {
break;
@@ -2070,7 +2075,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
data_len = ((header[4] & 0xFF) | (header[3] << 8));
cipher_overhead = prot->tag_size;
- if (prot->version != TLS_1_3_VERSION)
+ if (prot->version != TLS_1_3_VERSION &&
+ prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305)
cipher_overhead += prot->iv_size;
if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead +
@@ -2290,6 +2296,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
struct tls12_crypto_info_aes_gcm_256 *gcm_256_info;
struct tls12_crypto_info_aes_ccm_128 *ccm_128_info;
+ struct tls12_crypto_info_chacha20_poly1305 *chacha20_poly1305_info;
struct tls_sw_context_tx *sw_ctx_tx = NULL;
struct tls_sw_context_rx *sw_ctx_rx = NULL;
struct cipher_context *cctx;
@@ -2402,6 +2409,21 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
cipher_name = "ccm(aes)";
break;
}
+ case TLS_CIPHER_CHACHA20_POLY1305: {
+ chacha20_poly1305_info = (void *)crypto_info;
+ nonce_size = 0;
+ tag_size = TLS_CIPHER_CHACHA20_POLY1305_TAG_SIZE;
+ iv_size = TLS_CIPHER_CHACHA20_POLY1305_IV_SIZE;
+ iv = chacha20_poly1305_info->iv;
+ rec_seq_size = TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE;
+ rec_seq = chacha20_poly1305_info->rec_seq;
+ keysize = TLS_CIPHER_CHACHA20_POLY1305_KEY_SIZE;
+ key = chacha20_poly1305_info->key;
+ salt = chacha20_poly1305_info->salt;
+ salt_size = TLS_CIPHER_CHACHA20_POLY1305_SALT_SIZE;
+ cipher_name = "rfc7539(chacha20,poly1305)";
+ break;
+ }
default:
rc = -EINVAL;
goto free_priv;
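
With the new cipher case in place, user space can request ChaCha20-Poly1305 for kTLS. A sketch of the TX setup, assuming the tls12_crypto_info_chacha20_poly1305 layout and constants from the uapi header added alongside this series:

    /* Error handling and key material sourcing elided; struct and
     * constants assumed from the updated uapi <linux/tls.h>.
     */
    #include <linux/tls.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <string.h>
    #include <sys/socket.h>

    #ifndef SOL_TLS
    #define SOL_TLS 282
    #endif

    static int enable_chacha_tx(int fd, const unsigned char *key,
                                const unsigned char *iv,
                                const unsigned char *rec_seq)
    {
        struct tls12_crypto_info_chacha20_poly1305 ci;

        /* attach the TLS ULP before any TLS_TX/TLS_RX setsockopt */
        if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")))
            return -1;

        memset(&ci, 0, sizeof(ci));
        ci.info.version = TLS_1_3_VERSION;
        ci.info.cipher_type = TLS_CIPHER_CHACHA20_POLY1305;
        memcpy(ci.key, key, TLS_CIPHER_CHACHA20_POLY1305_KEY_SIZE);
        memcpy(ci.iv, iv, TLS_CIPHER_CHACHA20_POLY1305_IV_SIZE);
        memcpy(ci.rec_seq, rec_seq, TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE);

        return setsockopt(fd, SOL_TLS, TLS_TX, &ci, sizeof(ci));
    }
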
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 92784e51ee7d..41c3303c3357 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -613,7 +613,6 @@ static int unix_listen(struct socket *sock, int backlog)
int err;
struct sock *sk = sock->sk;
struct unix_sock *u = unix_sk(sk);
- struct pid *old_pid = NULL;
err = -EOPNOTSUPP;
if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
@@ -634,7 +633,6 @@ static int unix_listen(struct socket *sock, int backlog)
out_unlock:
unix_state_unlock(sk);
- put_pid(old_pid);
out:
return err;
}
@@ -878,7 +876,6 @@ static int unix_autobind(struct socket *sock)
if (err)
return err;
- err = 0;
if (u->addr)
goto out;
diff --git a/net/unix/scm.c b/net/unix/scm.c
index 8c40f2b32392..052ae709ce28 100644
--- a/net/unix/scm.c
+++ b/net/unix/scm.c
@@ -8,6 +8,7 @@
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/init.h>
+#include <linux/io_uring.h>
#include "scm.h"
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 9e93bc201cc0..5546710d8ac1 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -421,7 +421,8 @@ static void vsock_deassign_transport(struct vsock_sock *vsk)
* The vsk->remote_addr is used to decide which transport to use:
* - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if
* g2h is not loaded, will use local transport;
- * - remote CID <= VMADDR_CID_HOST will use guest->host transport;
+ * - remote CID <= VMADDR_CID_HOST, or h2g is not loaded, or the remote
+ * flags field includes VMADDR_FLAG_TO_HOST, will use guest->host transport;
* - remote CID > VMADDR_CID_HOST will use host->guest transport;
*/
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
@@ -429,8 +430,23 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
const struct vsock_transport *new_transport;
struct sock *sk = sk_vsock(vsk);
unsigned int remote_cid = vsk->remote_addr.svm_cid;
+ __u8 remote_flags;
int ret;
+ /* If the packet is coming with the source and destination CIDs higher
+ * than VMADDR_CID_HOST, then a vsock channel where all the packets are
+ * forwarded to the host should be established. Then the host will
+ * need to forward the packets to the guest.
+ *
+ * The flag is set on the (listen) receive path (psk is not NULL). On
+ * the connect path the flag can be set by the user space application.
+ */
+ if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST &&
+ vsk->remote_addr.svm_cid > VMADDR_CID_HOST)
+ vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST;
+
+ remote_flags = vsk->remote_addr.svm_flags;
+
switch (sk->sk_type) {
case SOCK_DGRAM:
new_transport = transport_dgram;
@@ -438,7 +454,8 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
case SOCK_STREAM:
if (vsock_use_local_transport(remote_cid))
new_transport = transport_local;
- else if (remote_cid <= VMADDR_CID_HOST)
+ else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g ||
+ (remote_flags & VMADDR_FLAG_TO_HOST))
new_transport = transport_g2h;
else
new_transport = transport_h2g;
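
On the connect path, the comment above notes that user space may set the flag itself. A sketch of a guest-to-guest connect forced through the guest->host transport, assuming the updated <linux/vm_sockets.h> that exposes svm_flags and VMADDR_FLAG_TO_HOST:

    /* Guest-to-guest connect forwarded through the host. */
    #include <sys/socket.h>
    #include <linux/vm_sockets.h>
    #include <string.h>
    #include <unistd.h>

    static int connect_via_host(unsigned int peer_cid, unsigned int port)
    {
        struct sockaddr_vm addr;
        int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

        if (fd < 0)
            return -1;

        memset(&addr, 0, sizeof(addr));
        addr.svm_family = AF_VSOCK;
        addr.svm_cid = peer_cid;               /* another guest, CID > VMADDR_CID_HOST */
        addr.svm_port = port;
        addr.svm_flags = VMADDR_FLAG_TO_HOST;  /* pick the g2h transport */

        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr))) {
            close(fd);
            return -1;
        }
        return fd;
    }
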
@@ -739,7 +756,7 @@ static struct sock *__vsock_create(struct net *net,
vsk->buffer_min_size = psk->buffer_min_size;
vsk->buffer_max_size = psk->buffer_max_size;
} else {
- vsk->trusted = capable(CAP_NET_ADMIN);
+ vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN);
vsk->owner = get_current_cred();
vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE;
@@ -926,10 +943,12 @@ static int vsock_shutdown(struct socket *sock, int mode)
*/
sk = sock->sk;
+
+ lock_sock(sk);
if (sock->state == SS_UNCONNECTED) {
err = -ENOTCONN;
if (sk->sk_type == SOCK_STREAM)
- return err;
+ goto out;
} else {
sock->state = SS_DISCONNECTING;
err = 0;
@@ -938,10 +957,8 @@ static int vsock_shutdown(struct socket *sock, int mode)
/* Receive and send shutdowns are treated alike. */
mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
if (mode) {
- lock_sock(sk);
sk->sk_shutdown |= mode;
sk->sk_state_change(sk);
- release_sock(sk);
if (sk->sk_type == SOCK_STREAM) {
sock_reset_flag(sk, SOCK_DONE);
@@ -949,6 +966,8 @@ static int vsock_shutdown(struct socket *sock, int mode)
}
}
+out:
+ release_sock(sk);
return err;
}
@@ -997,9 +1016,12 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
} else if (sock->type == SOCK_STREAM) {
- const struct vsock_transport *transport = vsk->transport;
+ const struct vsock_transport *transport;
+
lock_sock(sk);
+ transport = vsk->transport;
+
/* Listening sockets that have connections in their accept
* queue can be read.
*/
@@ -1082,10 +1104,11 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
err = 0;
sk = sock->sk;
vsk = vsock_sk(sk);
- transport = vsk->transport;
lock_sock(sk);
+ transport = vsk->transport;
+
err = vsock_auto_bind(vsk);
if (err)
goto out;
@@ -1212,7 +1235,7 @@ static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
{
const struct vsock_transport *transport = vsk->transport;
- if (!transport->cancel_pkt)
+ if (!transport || !transport->cancel_pkt)
return -EOPNOTSUPP;
return transport->cancel_pkt(vsk);
@@ -1222,7 +1245,6 @@ static void vsock_connect_timeout(struct work_struct *work)
{
struct sock *sk;
struct vsock_sock *vsk;
- int cancel = 0;
vsk = container_of(work, struct vsock_sock, connect_work.work);
sk = sk_vsock(vsk);
@@ -1233,11 +1255,9 @@ static void vsock_connect_timeout(struct work_struct *work)
sk->sk_state = TCP_CLOSE;
sk->sk_err = ETIMEDOUT;
sk->sk_error_report(sk);
- cancel = 1;
+ vsock_transport_cancel_pkt(vsk);
}
release_sock(sk);
- if (cancel)
- vsock_transport_cancel_pkt(vsk);
sock_put(sk);
}
@@ -1544,10 +1564,11 @@ static int vsock_stream_setsockopt(struct socket *sock,
err = 0;
sk = sock->sk;
vsk = vsock_sk(sk);
- transport = vsk->transport;
lock_sock(sk);
+ transport = vsk->transport;
+
switch (optname) {
case SO_VM_SOCKETS_BUFFER_SIZE:
COPY_IN(val);
@@ -1680,7 +1701,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
sk = sock->sk;
vsk = vsock_sk(sk);
- transport = vsk->transport;
total_written = 0;
err = 0;
@@ -1689,6 +1709,8 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
lock_sock(sk);
+ transport = vsk->transport;
+
/* Callers should not provide a destination with stream sockets. */
if (msg->msg_namelen) {
err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
@@ -1823,11 +1845,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
sk = sock->sk;
vsk = vsock_sk(sk);
- transport = vsk->transport;
err = 0;
lock_sock(sk);
+ transport = vsk->transport;
+
if (!transport || sk->sk_state != TCP_ESTABLISHED) {
/* Recvmsg is supposed to return 0 if a peer performs an
* orderly shutdown. Differentiate between that case and when a
@@ -2072,8 +2095,7 @@ static long vsock_dev_do_ioctl(struct file *filp,
break;
default:
- pr_err("Unknown ioctl %d\n", cmd);
- retval = -EINVAL;
+ retval = -ENOIOCTLCMD;
}
return retval;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 630b851f8150..cc3bae2659e7 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -474,14 +474,10 @@ static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode)
static int hvs_shutdown(struct vsock_sock *vsk, int mode)
{
- struct sock *sk = sk_vsock(vsk);
-
if (!(mode & SEND_SHUTDOWN))
return 0;
- lock_sock(sk);
hvs_shutdown_lock_held(vsk->trans, mode);
- release_sock(sk);
return 0;
}
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 0edda1edf988..e4370b1b7494 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -841,8 +841,10 @@ void virtio_transport_release(struct vsock_sock *vsk)
virtio_transport_free_pkt(pkt);
}
- if (remove_sock)
+ if (remove_sock) {
+ sock_set_flag(sk, SOCK_DONE);
vsock_remove_sock(vsk);
+ }
}
EXPORT_SYMBOL_GPL(virtio_transport_release);
@@ -1128,18 +1130,18 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
vsk = vsock_sk(sk);
- space_available = virtio_transport_space_update(sk, pkt);
-
lock_sock(sk);
- /* Check if sk has been released before lock_sock */
- if (sk->sk_shutdown == SHUTDOWN_MASK) {
+ /* Check if sk has been closed before lock_sock */
+ if (sock_flag(sk, SOCK_DONE)) {
(void)virtio_transport_reset_no_sock(t, pkt);
release_sock(sk);
sock_put(sk);
goto free_pkt;
}
+ space_available = virtio_transport_space_update(sk, pkt);
+
/* Update CID in case it has changed after a transport reset event */
vsk->local_addr.svm_cid = dst.svm_cid;
diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c
index 909de26cb0e7..223b9660a759 100644
--- a/net/vmw_vsock/vsock_addr.c
+++ b/net/vmw_vsock/vsock_addr.c
@@ -22,13 +22,15 @@ EXPORT_SYMBOL_GPL(vsock_addr_init);
int vsock_addr_validate(const struct sockaddr_vm *addr)
{
+ __u8 svm_valid_flags = VMADDR_FLAG_TO_HOST;
+
if (!addr)
return -EFAULT;
if (addr->svm_family != AF_VSOCK)
return -EAFNOSUPPORT;
- if (addr->svm_zero[0] != 0)
+ if (addr->svm_flags & ~svm_valid_flags)
return -EINVAL;
return 0;
diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig
deleted file mode 100644
index d13762bc4abc..000000000000
--- a/net/wimax/Kconfig
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# WiMAX LAN device configuration
-#
-
-menuconfig WIMAX
- tristate "WiMAX Wireless Broadband support"
- depends on RFKILL || !RFKILL
- help
-
- Select to configure support for devices that provide
- wireless broadband connectivity using the WiMAX protocol
- (IEEE 802.16).
-
- Please note that most of these devices require signing up
- for a service plan with a provider.
-
- The different WiMAX drivers can be enabled in the menu entry
-
- Device Drivers > Network device support > WiMAX Wireless
- Broadband devices
-
- If unsure, it is safe to select M (module).
-
-config WIMAX_DEBUG_LEVEL
- int "WiMAX debug level"
- depends on WIMAX
- default 8
- help
-
- Select the maximum debug verbosity level to be compiled into
- the WiMAX stack code.
-
- By default, debug messages are disabled at runtime and can
- be selectively enabled for different parts of the code using
- the sysfs debug-levels file.
-
- If set at zero, this will compile out all the debug code.
-
- It is recommended that it is left at 8.
diff --git a/net/wimax/Makefile b/net/wimax/Makefile
deleted file mode 100644
index c2a71ae487ac..000000000000
--- a/net/wimax/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-obj-$(CONFIG_WIMAX) += wimax.o
-
-wimax-y := \
- id-table.o \
- op-msg.o \
- op-reset.o \
- op-rfkill.o \
- op-state-get.o \
- stack.o
-
-wimax-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/net/wimax/debug-levels.h b/net/wimax/debug-levels.h
deleted file mode 100644
index ebc287cde336..000000000000
--- a/net/wimax/debug-levels.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Linux WiMAX Stack
- * Debug levels control file for the wimax module
- *
- * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com>
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- */
-#ifndef __debug_levels__h__
-#define __debug_levels__h__
-
-/* Maximum compile and run time debug level for all submodules */
-#define D_MODULENAME wimax
-#define D_MASTER CONFIG_WIMAX_DEBUG_LEVEL
-
-#include <linux/wimax/debug.h>
-
-/* List of all the enabled modules */
-enum d_module {
- D_SUBMODULE_DECLARE(debugfs),
- D_SUBMODULE_DECLARE(id_table),
- D_SUBMODULE_DECLARE(op_msg),
- D_SUBMODULE_DECLARE(op_reset),
- D_SUBMODULE_DECLARE(op_rfkill),
- D_SUBMODULE_DECLARE(op_state_get),
- D_SUBMODULE_DECLARE(stack),
-};
-
-#endif /* #ifndef __debug_levels__h__ */
diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c
deleted file mode 100644
index 3c54bb6b925a..000000000000
--- a/net/wimax/debugfs.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * Debugfs support
- *
- * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- */
-#include <linux/debugfs.h>
-#include <linux/wimax.h>
-#include "wimax-internal.h"
-
-#define D_SUBMODULE debugfs
-#include "debug-levels.h"
-
-void wimax_debugfs_add(struct wimax_dev *wimax_dev)
-{
- struct net_device *net_dev = wimax_dev->net_dev;
- struct dentry *dentry;
- char buf[128];
-
- snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name);
- dentry = debugfs_create_dir(buf, NULL);
- wimax_dev->debugfs_dentry = dentry;
-
- d_level_register_debugfs("wimax_dl_", debugfs, dentry);
- d_level_register_debugfs("wimax_dl_", id_table, dentry);
- d_level_register_debugfs("wimax_dl_", op_msg, dentry);
- d_level_register_debugfs("wimax_dl_", op_reset, dentry);
- d_level_register_debugfs("wimax_dl_", op_rfkill, dentry);
- d_level_register_debugfs("wimax_dl_", op_state_get, dentry);
- d_level_register_debugfs("wimax_dl_", stack, dentry);
-}
-
-void wimax_debugfs_rm(struct wimax_dev *wimax_dev)
-{
- debugfs_remove_recursive(wimax_dev->debugfs_dentry);
-}
diff --git a/net/wimax/id-table.c b/net/wimax/id-table.c
deleted file mode 100644
index 02eee37b7e31..000000000000
--- a/net/wimax/id-table.c
+++ /dev/null
@@ -1,130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * Mapping of generic netlink family IDs to net devices
- *
- * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- *
- * We assign a single generic netlink family ID to each device (to
- * simplify lookup).
- *
- * We need a way to map family ID to a wimax_dev pointer.
- *
- * The idea is to use a very simple lookup. Using a netlink attribute
- * with (for example) the interface name implies a heavier search over
- * all the network devices; seemed kind of a waste given that we know
- * we are looking for a WiMAX device and that most systems will have
- * just a single WiMAX adapter.
- *
- * We put all the WiMAX devices in the system in a linked list and
- * match the generic link family ID against the list.
- *
- * By using a linked list, the case of a single adapter in the system
- * becomes (almost) no overhead, while still working for many more. If
- * it ever goes beyond two, I'll be surprised.
- */
-#include <linux/device.h>
-#include <net/genetlink.h>
-#include <linux/netdevice.h>
-#include <linux/list.h>
-#include <linux/wimax.h>
-#include "wimax-internal.h"
-
-
-#define D_SUBMODULE id_table
-#include "debug-levels.h"
-
-
-static DEFINE_SPINLOCK(wimax_id_table_lock);
-static struct list_head wimax_id_table = LIST_HEAD_INIT(wimax_id_table);
-
-
-/*
- * wimax_id_table_add - add a gennetlink family ID / wimax_dev mapping
- *
- * @wimax_dev: WiMAX device descriptor to associate to the Generic
- * Netlink family ID.
- *
- * Look for an empty spot in the ID table; if none found, double the
- * table's size and get the first spot.
- */
-void wimax_id_table_add(struct wimax_dev *wimax_dev)
-{
- d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev);
- spin_lock(&wimax_id_table_lock);
- list_add(&wimax_dev->id_table_node, &wimax_id_table);
- spin_unlock(&wimax_id_table_lock);
- d_fnend(3, NULL, "(wimax_dev %p)\n", wimax_dev);
-}
-
-
-/*
- * wimax_get_netdev_by_info - lookup a wimax_dev from the gennetlink info
- *
- * The generic netlink family ID has been filled out in the
- * nlmsghdr->nlmsg_type field, so we pull it from there, look it up in
- * the mapping table and reference the wimax_dev.
- *
- * When done, the reference should be dropped with
- * 'dev_put(wimax_dev->net_dev)'.
- */
-struct wimax_dev *wimax_dev_get_by_genl_info(
- struct genl_info *info, int ifindex)
-{
- struct wimax_dev *wimax_dev = NULL;
-
- d_fnstart(3, NULL, "(info %p ifindex %d)\n", info, ifindex);
- spin_lock(&wimax_id_table_lock);
- list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) {
- if (wimax_dev->net_dev->ifindex == ifindex) {
- dev_hold(wimax_dev->net_dev);
- goto found;
- }
- }
- wimax_dev = NULL;
- d_printf(1, NULL, "wimax: no devices found with ifindex %d\n",
- ifindex);
-found:
- spin_unlock(&wimax_id_table_lock);
- d_fnend(3, NULL, "(info %p ifindex %d) = %p\n",
- info, ifindex, wimax_dev);
- return wimax_dev;
-}
-
-
-/*
- * wimax_id_table_rm - Remove a gennetlink family ID / wimax_dev mapping
- *
- * @id: family ID to remove from the table
- */
-void wimax_id_table_rm(struct wimax_dev *wimax_dev)
-{
- spin_lock(&wimax_id_table_lock);
- list_del_init(&wimax_dev->id_table_node);
- spin_unlock(&wimax_id_table_lock);
-}
-
-
-/*
- * Release the gennetlink family id / mapping table
- *
- * On debug, verify that the table is empty upon removal. We want the
- * code always compiled, to ensure it doesn't bit rot. It will be
- * compiled out if CONFIG_BUG is disabled.
- */
-void wimax_id_table_release(void)
-{
- struct wimax_dev *wimax_dev;
-
-#ifndef CONFIG_BUG
- return;
-#endif
- spin_lock(&wimax_id_table_lock);
- list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) {
- pr_err("BUG: %s wimax_dev %p ifindex %d not cleared\n",
- __func__, wimax_dev, wimax_dev->net_dev->ifindex);
- WARN_ON(1);
- }
- spin_unlock(&wimax_id_table_lock);
-}
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c
deleted file mode 100644
index 6460b5785758..000000000000
--- a/net/wimax/op-msg.c
+++ /dev/null
@@ -1,391 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * Generic messaging interface between userspace and driver/device
- *
- * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com>
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- *
- * This implements a direct communication channel between user space and
- * the driver/device, by which free form messages can be sent back and
- * forth.
- *
- * This is intended for device-specific features, vendor quirks, etc.
- *
- * See include/net/wimax.h
- *
- * GENERIC NETLINK ENCODING AND CAPACITY
- *
- * A destination "pipe name" is added to each message; it is up to the
- * drivers to assign or use those names (if using them at all).
- *
- * Messages are encoded as a binary netlink attribute using nla_put()
- * using type NLA_UNSPEC (as some versions of libnl still in
- * deployment don't yet understand NLA_BINARY).
- *
- * The maximum capacity of this transport is PAGESIZE per message (so
- * the actual payload will be a bit smaller depending on the
- * netlink/generic netlink attributes and headers).
- *
- * RECEPTION OF MESSAGES
- *
- * When a message is received from user space, it is passed verbatim
- * to the driver calling wimax_dev->op_msg_from_user(). The return
- * value from this function is passed back to user space as an ack
- * over the generic netlink protocol.
- *
- * The stack doesn't do any processing or interpretation of these
- * messages.
- *
- * SENDING MESSAGES
- *
- * Messages can be sent with wimax_msg().
- *
- * If the message delivery needs to happen on a different context to
- * that of its creation, wimax_msg_alloc() can be used to get a
- * pointer to the message that can be delivered later on with
- * wimax_msg_send().
- *
- * ROADMAP
- *
- * wimax_gnl_doit_msg_from_user() Process a message from user space
- * wimax_dev_get_by_genl_info()
- * wimax_dev->op_msg_from_user() Delivery of message to the driver
- *
- * wimax_msg() Send a message to user space
- * wimax_msg_alloc()
- * wimax_msg_send()
- */
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <net/genetlink.h>
-#include <linux/netdevice.h>
-#include <linux/wimax.h>
-#include <linux/security.h>
-#include <linux/export.h>
-#include "wimax-internal.h"
-
-
-#define D_SUBMODULE op_msg
-#include "debug-levels.h"
-
-
-/**
- * wimax_msg_alloc - Create a new skb for sending a message to userspace
- *
- * @wimax_dev: WiMAX device descriptor
- * @pipe_name: "named pipe" the message will be sent to
- * @msg: pointer to the message data to send
- * @size: size of the message to send (in bytes), including the header.
- * @gfp_flags: flags for memory allocation.
- *
- * Returns: %0 if ok, negative errno code on error
- *
- * Description:
- *
- * Allocates an skb that will contain the message to send to user
- * space over the messaging pipe and initializes it, copying the
- * payload.
- *
- * Once this call is done, you can deliver it with
- * wimax_msg_send().
- *
- * IMPORTANT:
- *
- * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as
- * wimax_msg_send() depends on skb->data being placed at the
- * beginning of the user message.
- *
- * Unlike other WiMAX stack calls, this call can be used way early,
- * even before wimax_dev_add() is called, as long as the
- * wimax_dev->net_dev pointer is set to point to a proper
- * net_dev. This is so that drivers can use it early in case they need
- * to send stuff around or communicate with user space.
- */
-struct sk_buff *wimax_msg_alloc(struct wimax_dev *wimax_dev,
- const char *pipe_name,
- const void *msg, size_t size,
- gfp_t gfp_flags)
-{
- int result;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- size_t msg_size;
- void *genl_msg;
- struct sk_buff *skb;
-
- msg_size = nla_total_size(size)
- + nla_total_size(sizeof(u32))
- + (pipe_name ? nla_total_size(strlen(pipe_name)) : 0);
- result = -ENOMEM;
- skb = genlmsg_new(msg_size, gfp_flags);
- if (skb == NULL)
- goto error_new;
- genl_msg = genlmsg_put(skb, 0, 0, &wimax_gnl_family,
- 0, WIMAX_GNL_OP_MSG_TO_USER);
- if (genl_msg == NULL) {
- dev_err(dev, "no memory to create generic netlink message\n");
- goto error_genlmsg_put;
- }
- result = nla_put_u32(skb, WIMAX_GNL_MSG_IFIDX,
- wimax_dev->net_dev->ifindex);
- if (result < 0) {
- dev_err(dev, "no memory to add ifindex attribute\n");
- goto error_nla_put;
- }
- if (pipe_name) {
- result = nla_put_string(skb, WIMAX_GNL_MSG_PIPE_NAME,
- pipe_name);
- if (result < 0) {
- dev_err(dev, "no memory to add pipe_name attribute\n");
- goto error_nla_put;
- }
- }
- result = nla_put(skb, WIMAX_GNL_MSG_DATA, size, msg);
- if (result < 0) {
- dev_err(dev, "no memory to add payload (msg %p size %zu) in "
- "attribute: %d\n", msg, size, result);
- goto error_nla_put;
- }
- genlmsg_end(skb, genl_msg);
- return skb;
-
-error_nla_put:
-error_genlmsg_put:
-error_new:
- nlmsg_free(skb);
- return ERR_PTR(result);
-}
-EXPORT_SYMBOL_GPL(wimax_msg_alloc);
-
-
-/**
- * wimax_msg_data_len - Return a pointer and size of a message's payload
- *
- * @msg: Pointer to a message created with wimax_msg_alloc()
- * @size: Pointer to where to store the message's size
- *
- * Returns the pointer to the message data.
- */
-const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size)
-{
- struct nlmsghdr *nlh = (void *) msg->head;
- struct nlattr *nla;
-
- nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr),
- WIMAX_GNL_MSG_DATA);
- if (nla == NULL) {
- pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n");
- return NULL;
- }
- *size = nla_len(nla);
- return nla_data(nla);
-}
-EXPORT_SYMBOL_GPL(wimax_msg_data_len);
-
-
-/**
- * wimax_msg_data - Return a pointer to a message's payload
- *
- * @msg: Pointer to a message created with wimax_msg_alloc()
- */
-const void *wimax_msg_data(struct sk_buff *msg)
-{
- struct nlmsghdr *nlh = (void *) msg->head;
- struct nlattr *nla;
-
- nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr),
- WIMAX_GNL_MSG_DATA);
- if (nla == NULL) {
- pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n");
- return NULL;
- }
- return nla_data(nla);
-}
-EXPORT_SYMBOL_GPL(wimax_msg_data);
-
-
-/**
- * wimax_msg_len - Return a message's payload length
- *
- * @msg: Pointer to a message created with wimax_msg_alloc()
- */
-ssize_t wimax_msg_len(struct sk_buff *msg)
-{
- struct nlmsghdr *nlh = (void *) msg->head;
- struct nlattr *nla;
-
- nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr),
- WIMAX_GNL_MSG_DATA);
- if (nla == NULL) {
- pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n");
- return -EINVAL;
- }
- return nla_len(nla);
-}
-EXPORT_SYMBOL_GPL(wimax_msg_len);
-
-
-/**
- * wimax_msg_send - Send a pre-allocated message to user space
- *
- * @wimax_dev: WiMAX device descriptor
- *
- * @skb: &struct sk_buff returned by wimax_msg_alloc(). Note the
- * ownership of @skb is transferred to this function.
- *
- * Returns: 0 if ok, < 0 errno code on error
- *
- * Description:
- *
- * Sends a free-form message that was preallocated with
- * wimax_msg_alloc() and filled up.
- *
- * Assumes that once you pass an skb to this function for sending, it
- * owns it and will release it when done (on success).
- *
- * IMPORTANT:
- *
- * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as
- * wimax_msg_send() depends on skb->data being placed at the
- * beginning of the user message.
- *
- * Unlike other WiMAX stack calls, this call can be used way early,
- * even before wimax_dev_add() is called, as long as the
- * wimax_dev->net_dev pointer is set to point to a proper
- * net_dev. This is so that drivers can use it early in case they need
- * to send stuff around or communicate with user space.
- */
-int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb)
-{
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- void *msg = skb->data;
- size_t size = skb->len;
- might_sleep();
-
- d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size);
- d_dump(2, dev, msg, size);
- genlmsg_multicast(&wimax_gnl_family, skb, 0, 0, GFP_KERNEL);
- d_printf(1, dev, "CTX: genl multicast done\n");
- return 0;
-}
-EXPORT_SYMBOL_GPL(wimax_msg_send);
-
-
-/**
- * wimax_msg - Send a message to user space
- *
- * @wimax_dev: WiMAX device descriptor (properly referenced)
- * @pipe_name: "named pipe" the message will be sent to
- * @buf: pointer to the message to send.
- * @size: size of the buffer pointed to by @buf (in bytes).
- * @gfp_flags: flags for memory allocation.
- *
- * Returns: %0 if ok, negative errno code on error.
- *
- * Description:
- *
- * Sends a free-form message to user space on the device @wimax_dev.
- *
- * NOTES:
- *
- * Once the @skb is given to this function, this function owns it and
- * will release it when done (unless it returns an error).
- */
-int wimax_msg(struct wimax_dev *wimax_dev, const char *pipe_name,
- const void *buf, size_t size, gfp_t gfp_flags)
-{
- int result = -ENOMEM;
- struct sk_buff *skb;
-
- skb = wimax_msg_alloc(wimax_dev, pipe_name, buf, size, gfp_flags);
- if (IS_ERR(skb))
- result = PTR_ERR(skb);
- else
- result = wimax_msg_send(wimax_dev, skb);
- return result;
-}
-EXPORT_SYMBOL_GPL(wimax_msg);
-
-/*
- * Relays a message from user space to the driver
- *
- * The skb is passed to the driver-specific function with the netlink
- * and generic netlink headers already stripped.
- *
- * This call will block while handling/relaying the message.
- */
-int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info)
-{
- int result, ifindex;
- struct wimax_dev *wimax_dev;
- struct device *dev;
- struct nlmsghdr *nlh = info->nlhdr;
- char *pipe_name;
- void *msg_buf;
- size_t msg_len;
-
- might_sleep();
- d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
- result = -ENODEV;
- if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) {
- pr_err("WIMAX_GNL_MSG_FROM_USER: can't find IFIDX attribute\n");
- goto error_no_wimax_dev;
- }
- ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]);
- wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
- if (wimax_dev == NULL)
- goto error_no_wimax_dev;
- dev = wimax_dev_to_dev(wimax_dev);
-
- /* Unpack arguments */
- result = -EINVAL;
- if (info->attrs[WIMAX_GNL_MSG_DATA] == NULL) {
- dev_err(dev, "WIMAX_GNL_MSG_FROM_USER: can't find MSG_DATA "
- "attribute\n");
- goto error_no_data;
- }
- msg_buf = nla_data(info->attrs[WIMAX_GNL_MSG_DATA]);
- msg_len = nla_len(info->attrs[WIMAX_GNL_MSG_DATA]);
-
- if (info->attrs[WIMAX_GNL_MSG_PIPE_NAME] == NULL)
- pipe_name = NULL;
- else {
- struct nlattr *attr = info->attrs[WIMAX_GNL_MSG_PIPE_NAME];
- size_t attr_len = nla_len(attr);
- /* libnl-1.1 does not yet support NLA_NUL_STRING */
- result = -ENOMEM;
- pipe_name = kstrndup(nla_data(attr), attr_len + 1, GFP_KERNEL);
- if (pipe_name == NULL)
- goto error_alloc;
- pipe_name[attr_len] = 0;
- }
- mutex_lock(&wimax_dev->mutex);
- result = wimax_dev_is_ready(wimax_dev);
- if (result == -ENOMEDIUM)
- result = 0;
- if (result < 0)
- goto error_not_ready;
- result = -ENOSYS;
- if (wimax_dev->op_msg_from_user == NULL)
- goto error_noop;
-
- d_printf(1, dev,
- "CRX: nlmsghdr len %u type %u flags 0x%04x seq 0x%x pid %u\n",
- nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags,
- nlh->nlmsg_seq, nlh->nlmsg_pid);
- d_printf(1, dev, "CRX: wimax message %zu bytes\n", msg_len);
- d_dump(2, dev, msg_buf, msg_len);
-
- result = wimax_dev->op_msg_from_user(wimax_dev, pipe_name,
- msg_buf, msg_len, info);
-error_noop:
-error_not_ready:
- mutex_unlock(&wimax_dev->mutex);
-error_alloc:
- kfree(pipe_name);
-error_no_data:
- dev_put(wimax_dev->net_dev);
-error_no_wimax_dev:
- d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
- return result;
-}
diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c
deleted file mode 100644
index 9899b2e56721..000000000000
--- a/net/wimax/op-reset.c
+++ /dev/null
@@ -1,108 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * Implement and export a method for resetting a WiMAX device
- *
- * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com>
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- *
- * This implements a simple synchronous call to reset a WiMAX device.
- *
- * Resets aim at being warm, keeping the device handles active;
- * however, when that fails, it falls back to a cold reset (that will
- * disconnect and reconnect the device).
- */
-
-#include <net/wimax.h>
-#include <net/genetlink.h>
-#include <linux/wimax.h>
-#include <linux/security.h>
-#include <linux/export.h>
-#include "wimax-internal.h"
-
-#define D_SUBMODULE op_reset
-#include "debug-levels.h"
-
-
-/**
- * wimax_reset - Reset a WiMAX device
- *
- * @wimax_dev: WiMAX device descriptor
- *
- * Returns:
- *
- * %0 if ok and a warm reset was done (the device still exists in
- * the system).
- *
- * -%ENODEV if a cold/bus reset had to be done (device has
- * disconnected and reconnected, so current handle is not valid
- * any more).
- *
- * -%EINVAL if the device is not even registered.
- *
- * Any other negative error code shall be considered as
- * non-recoverable.
- *
- * Description:
- *
- * Called when wanting to reset the device for any reason. Device is
- * taken back to power on status.
- *
- * This call blocks; on successful return, the device has completed the
- * reset process and is ready to operate.
- */
-int wimax_reset(struct wimax_dev *wimax_dev)
-{
- int result = -EINVAL;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- enum wimax_st state;
-
- might_sleep();
- d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
- mutex_lock(&wimax_dev->mutex);
- dev_hold(wimax_dev->net_dev);
- state = wimax_dev->state;
- mutex_unlock(&wimax_dev->mutex);
-
- if (state >= WIMAX_ST_DOWN) {
- mutex_lock(&wimax_dev->mutex_reset);
- result = wimax_dev->op_reset(wimax_dev);
- mutex_unlock(&wimax_dev->mutex_reset);
- }
- dev_put(wimax_dev->net_dev);
-
- d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result);
- return result;
-}
-EXPORT_SYMBOL(wimax_reset);
-
-
-/*
- * Exporting to user space over generic netlink
- *
- * Parse the reset command from user space, return error code.
- *
- * No attributes.
- */
-int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info)
-{
- int result, ifindex;
- struct wimax_dev *wimax_dev;
-
- d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
- result = -ENODEV;
- if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) {
- pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n");
- goto error_no_wimax_dev;
- }
- ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]);
- wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
- if (wimax_dev == NULL)
- goto error_no_wimax_dev;
- /* Execute the operation and send the result back to user space */
- result = wimax_reset(wimax_dev);
- dev_put(wimax_dev->net_dev);
-error_no_wimax_dev:
- d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
- return result;
-}
diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c
deleted file mode 100644
index 248d10b60b05..000000000000
--- a/net/wimax/op-rfkill.c
+++ /dev/null
@@ -1,431 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * RF-kill framework integration
- *
- * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com>
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- *
- * This integrates into the Linux Kernel rfkill subsystem so that the
- * drivers just have to do the bare minimal work, which is providing a
- * method to set the software RF-Kill switch and to report changes in
- * the software and hardware switch status.
- *
- * A non-polled generic rfkill device is embedded into the WiMAX
- * subsystem's representation of a device.
- *
- * FIXME: Need polled support? Let drivers provide a poll routine
- * and hand it to rfkill ops then?
- *
- * All device drivers have to do is after wimax_dev_init(), call
- * wimax_report_rfkill_hw() and wimax_report_rfkill_sw() to update
- * initial state and then every time it changes. See wimax.h:struct
- * wimax_dev for more information.
- *
- * ROADMAP
- *
- * wimax_gnl_doit_rfkill() User space calling wimax_rfkill()
- * wimax_rfkill() Kernel calling wimax_rfkill()
- * __wimax_rf_toggle_radio()
- *
- * wimax_rfkill_set_radio_block() RF-Kill subsystem calling
- * __wimax_rf_toggle_radio()
- *
- * __wimax_rf_toggle_radio()
- * wimax_dev->op_rfkill_sw_toggle() Driver backend
- * __wimax_state_change()
- *
- * wimax_report_rfkill_sw() Driver reports state change
- * __wimax_state_change()
- *
- * wimax_report_rfkill_hw() Driver reports state change
- * __wimax_state_change()
- *
- * wimax_rfkill_add() Initialize/shutdown rfkill support
- * wimax_rfkill_rm() [called by wimax_dev_add/rm()]
- */
-
-#include <net/wimax.h>
-#include <net/genetlink.h>
-#include <linux/wimax.h>
-#include <linux/security.h>
-#include <linux/rfkill.h>
-#include <linux/export.h>
-#include "wimax-internal.h"
-
-#define D_SUBMODULE op_rfkill
-#include "debug-levels.h"
-
-/**
- * wimax_report_rfkill_hw - Reports changes in the hardware RF switch
- *
- * @wimax_dev: WiMAX device descriptor
- *
- * @state: New state of the RF Kill switch. %WIMAX_RF_ON radio on,
- * %WIMAX_RF_OFF radio off.
- *
- * When the device detects a change in the state of the hardware RF
- * switch, it must call this function to let the WiMAX kernel stack
- * know that the state has changed so it can be properly propagated.
- *
- * The WiMAX stack caches the state (the driver doesn't need to). As
- * well, as the change is propagated it will come back as a request to
- * change the software state to mirror the hardware state.
- *
- * If the device doesn't have a hardware kill switch, just report
- * it on initialization as always on (%WIMAX_RF_ON, radio on).
- */
-void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev,
- enum wimax_rf_state state)
-{
- int result;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- enum wimax_st wimax_state;
-
- d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
- BUG_ON(state == WIMAX_RF_QUERY);
- BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF);
-
- mutex_lock(&wimax_dev->mutex);
- result = wimax_dev_is_ready(wimax_dev);
- if (result < 0)
- goto error_not_ready;
-
- if (state != wimax_dev->rf_hw) {
- wimax_dev->rf_hw = state;
- if (wimax_dev->rf_hw == WIMAX_RF_ON &&
- wimax_dev->rf_sw == WIMAX_RF_ON)
- wimax_state = WIMAX_ST_READY;
- else
- wimax_state = WIMAX_ST_RADIO_OFF;
-
- result = rfkill_set_hw_state(wimax_dev->rfkill,
- state == WIMAX_RF_OFF);
-
- __wimax_state_change(wimax_dev, wimax_state);
- }
-error_not_ready:
- mutex_unlock(&wimax_dev->mutex);
- d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n",
- wimax_dev, state, result);
-}
-EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw);
-
-
-/**
- * wimax_report_rfkill_sw - Reports changes in the software RF switch
- *
- * @wimax_dev: WiMAX device descriptor
- *
- * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on,
- * %WIMAX_RF_OFF radio off.
- *
- * Reports changes in the software RF switch state to the WiMAX stack.
- *
- * The main use is during initialization, so the driver can query the
- * device for its current software radio kill switch state and feed it
- * to the system.
- *
- * Normally the device does not change the software state by itself;
- * in practice this can happen, as the device might decide to switch
- * (in software) the radio off for different reasons.
- */
-void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev,
- enum wimax_rf_state state)
-{
- int result;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- enum wimax_st wimax_state;
-
- d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
- BUG_ON(state == WIMAX_RF_QUERY);
- BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF);
-
- mutex_lock(&wimax_dev->mutex);
- result = wimax_dev_is_ready(wimax_dev);
- if (result < 0)
- goto error_not_ready;
-
- if (state != wimax_dev->rf_sw) {
- wimax_dev->rf_sw = state;
- if (wimax_dev->rf_hw == WIMAX_RF_ON &&
- wimax_dev->rf_sw == WIMAX_RF_ON)
- wimax_state = WIMAX_ST_READY;
- else
- wimax_state = WIMAX_ST_RADIO_OFF;
- __wimax_state_change(wimax_dev, wimax_state);
- rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF);
- }
-error_not_ready:
- mutex_unlock(&wimax_dev->mutex);
- d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n",
- wimax_dev, state, result);
-}
-EXPORT_SYMBOL_GPL(wimax_report_rfkill_sw);
-
-
-/*
- * Callback for the RF Kill toggle operation
- *
- * This function is called by:
- *
- * - The rfkill subsystem when the RF-Kill key is pressed in the
- * hardware and the driver notifies through
- * wimax_report_rfkill_hw(). The rfkill subsystem ends up calling back
- * here so the software RF Kill switch state is changed to reflect
- * the hardware switch state.
- *
- * - When the user sets the state through sysfs' rfkill/state file
- *
- * - When the user calls wimax_rfkill().
- *
- * This call blocks!
- *
- * WARNING! When we call rfkill_unregister(), this will be called with
- * state 0!
- *
- * WARNING: wimax_dev must be locked
- */
-static
-int __wimax_rf_toggle_radio(struct wimax_dev *wimax_dev,
- enum wimax_rf_state state)
-{
- int result = 0;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- enum wimax_st wimax_state;
-
- might_sleep();
- d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
- if (wimax_dev->rf_sw == state)
- goto out_no_change;
- if (wimax_dev->op_rfkill_sw_toggle != NULL)
- result = wimax_dev->op_rfkill_sw_toggle(wimax_dev, state);
- else if (state == WIMAX_RF_OFF) /* No op? can't turn off */
- result = -ENXIO;
- else /* No op? can turn on */
- result = 0; /* should never happen though */
- if (result >= 0) {
- result = 0;
- wimax_dev->rf_sw = state;
- wimax_state = state == WIMAX_RF_ON ?
- WIMAX_ST_READY : WIMAX_ST_RADIO_OFF;
- __wimax_state_change(wimax_dev, wimax_state);
- }
-out_no_change:
- d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n",
- wimax_dev, state, result);
- return result;
-}
-
-
-/*
- * Translate from rfkill state to wimax state
- *
- * NOTE: Special state handling rules here
- *
- * Just pretend the call didn't happen if we are in a state where
- * we know for sure it cannot be handled (WIMAX_ST_DOWN or
- * __WIMAX_ST_QUIESCING). rfkill() needs it to register and
- * unregister, as it will run this path.
- *
- * NOTE: This call will block until the operation is completed.
- */
-static int wimax_rfkill_set_radio_block(void *data, bool blocked)
-{
- int result;
- struct wimax_dev *wimax_dev = data;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- enum wimax_rf_state rf_state;
-
- d_fnstart(3, dev, "(wimax_dev %p blocked %u)\n", wimax_dev, blocked);
- rf_state = WIMAX_RF_ON;
- if (blocked)
- rf_state = WIMAX_RF_OFF;
- mutex_lock(&wimax_dev->mutex);
- if (wimax_dev->state <= __WIMAX_ST_QUIESCING)
- result = 0;
- else
- result = __wimax_rf_toggle_radio(wimax_dev, rf_state);
- mutex_unlock(&wimax_dev->mutex);
- d_fnend(3, dev, "(wimax_dev %p blocked %u) = %d\n",
- wimax_dev, blocked, result);
- return result;
-}
-
-static const struct rfkill_ops wimax_rfkill_ops = {
- .set_block = wimax_rfkill_set_radio_block,
-};
-
-/**
- * wimax_rfkill - Set the software RF switch state for a WiMAX device
- *
- * @wimax_dev: WiMAX device descriptor
- *
- * @state: New RF state.
- *
- * Returns:
- *
- * >= 0 toggle state if ok, < 0 errno code on error. The toggle state
- * is returned as a bitmap, bit 0 being the hardware RF state, bit 1
- * the software RF state.
- *
- * 0 means disabled (%WIMAX_RF_ON, radio on), 1 means enabled radio
- * off (%WIMAX_RF_OFF).
- *
- * Description:
- *
- * Called by the user when he wants to request the WiMAX radio to be
- * switched on (%WIMAX_RF_ON) or off (%WIMAX_RF_OFF). With
- * %WIMAX_RF_QUERY, just the current state is returned.
- *
- * NOTE:
- *
- * This call will block until the operation is complete.
- */
-int wimax_rfkill(struct wimax_dev *wimax_dev, enum wimax_rf_state state)
-{
- int result;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
-
- d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
- mutex_lock(&wimax_dev->mutex);
- result = wimax_dev_is_ready(wimax_dev);
- if (result < 0) {
- /* While initializing, < 1.4.3 wimax-tools versions use
- * this call to check if the device is a valid WiMAX
- * device; so we allow it to proceed always,
- * considering the radios are all off. */
- if (result == -ENOMEDIUM && state == WIMAX_RF_QUERY)
- result = WIMAX_RF_OFF << 1 | WIMAX_RF_OFF;
- goto error_not_ready;
- }
- switch (state) {
- case WIMAX_RF_ON:
- case WIMAX_RF_OFF:
- result = __wimax_rf_toggle_radio(wimax_dev, state);
- if (result < 0)
- goto error;
- rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF);
- break;
- case WIMAX_RF_QUERY:
- break;
- default:
- result = -EINVAL;
- goto error;
- }
- result = wimax_dev->rf_sw << 1 | wimax_dev->rf_hw;
-error:
-error_not_ready:
- mutex_unlock(&wimax_dev->mutex);
- d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n",
- wimax_dev, state, result);
- return result;
-}
-EXPORT_SYMBOL(wimax_rfkill);
-
-
-/*
- * Register a new WiMAX device's RF Kill support
- *
- * WARNING: wimax_dev->mutex must be unlocked
- */
-int wimax_rfkill_add(struct wimax_dev *wimax_dev)
-{
- int result;
- struct rfkill *rfkill;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
-
- d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
- /* Initialize RF Kill */
- result = -ENOMEM;
- rfkill = rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX,
- &wimax_rfkill_ops, wimax_dev);
- if (rfkill == NULL)
- goto error_rfkill_allocate;
-
- d_printf(1, dev, "rfkill %p\n", rfkill);
-
- wimax_dev->rfkill = rfkill;
-
- rfkill_init_sw_state(rfkill, 1);
- result = rfkill_register(wimax_dev->rfkill);
- if (result < 0)
- goto error_rfkill_register;
-
- /* If there is no SW toggle op, SW RFKill is always on */
- if (wimax_dev->op_rfkill_sw_toggle == NULL)
- wimax_dev->rf_sw = WIMAX_RF_ON;
-
- d_fnend(3, dev, "(wimax_dev %p) = 0\n", wimax_dev);
- return 0;
-
-error_rfkill_register:
- rfkill_destroy(wimax_dev->rfkill);
-error_rfkill_allocate:
- d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result);
- return result;
-}
-
-
-/*
- * Deregister a WiMAX device's RF Kill support
- *
- * Ick, we can't call rfkill_free() after rfkill_unregister()...oh
- * well.
- *
- * WARNING: wimax_dev->mutex must be unlocked
- */
-void wimax_rfkill_rm(struct wimax_dev *wimax_dev)
-{
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
- rfkill_unregister(wimax_dev->rfkill);
- rfkill_destroy(wimax_dev->rfkill);
- d_fnend(3, dev, "(wimax_dev %p)\n", wimax_dev);
-}
-
-
-/*
- * Exporting to user space over generic netlink
- *
- * Parse the rfkill command from user space, return a combination
- * value that describe the states of the different toggles.
- *
- * Only one attribute: the new state requested (on, off or no change,
- * just query).
- */
-
-int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info)
-{
- int result, ifindex;
- struct wimax_dev *wimax_dev;
- struct device *dev;
- enum wimax_rf_state new_state;
-
- d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
- result = -ENODEV;
- if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL) {
- pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n");
- goto error_no_wimax_dev;
- }
- ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]);
- wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
- if (wimax_dev == NULL)
- goto error_no_wimax_dev;
- dev = wimax_dev_to_dev(wimax_dev);
- result = -EINVAL;
- if (info->attrs[WIMAX_GNL_RFKILL_STATE] == NULL) {
- dev_err(dev, "WIMAX_GNL_RFKILL: can't find RFKILL_STATE "
- "attribute\n");
- goto error_no_pid;
- }
- new_state = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_STATE]);
-
- /* Execute the operation and send the result back to user space */
- result = wimax_rfkill(wimax_dev, new_state);
-error_no_pid:
- dev_put(wimax_dev->net_dev);
-error_no_wimax_dev:
- d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
- return result;
-}
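
For reference, the rfkill and state-get handlers removed in this patch follow the same generic netlink doit pattern: check that the ifindex attribute is present, resolve it to a device (taking a reference), run the operation, and drop the reference; the integer result is returned to user space in the netlink ACK. A minimal sketch of that pattern is shown below; error paths are trimmed, example_gnl_doit is a hypothetical name, and the attribute and helper names are those of the removed code.

static int example_gnl_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct wimax_dev *wimax_dev;
	int result, ifindex;

	if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL)
		return -EINVAL;
	ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]);
	wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);	/* takes a reference */
	if (wimax_dev == NULL)
		return -ENODEV;
	result = wimax_rfkill(wimax_dev, WIMAX_RF_QUERY);	/* run the operation */
	dev_put(wimax_dev->net_dev);				/* drop the reference */
	return result;	/* delivered to user space as the netlink ACK value */
}
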
diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c
deleted file mode 100644
index 5bc712de1563..000000000000
--- a/net/wimax/op-state-get.c
+++ /dev/null
@@ -1,52 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * Implement and export a method for getting a WiMAX device current state
- *
- * Copyright (C) 2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
- *
- * Based on previous WiMAX core work by:
- * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com>
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- */
-
-#include <net/wimax.h>
-#include <net/genetlink.h>
-#include <linux/wimax.h>
-#include <linux/security.h>
-#include "wimax-internal.h"
-
-#define D_SUBMODULE op_state_get
-#include "debug-levels.h"
-
-
-/*
- * Exporting to user space over generic netlink
- *
- * Parse the state get command from user space, return a combination
- * value that describes the current state.
- *
- * No attributes.
- */
-int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info)
-{
- int result, ifindex;
- struct wimax_dev *wimax_dev;
-
- d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
- result = -ENODEV;
- if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) {
- pr_err("WIMAX_GNL_OP_STATE_GET: can't find IFIDX attribute\n");
- goto error_no_wimax_dev;
- }
- ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]);
- wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
- if (wimax_dev == NULL)
- goto error_no_wimax_dev;
- /* Execute the operation and send the result back to user space */
- result = wimax_state_get(wimax_dev);
- dev_put(wimax_dev->net_dev);
-error_no_wimax_dev:
- d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
- return result;
-}
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
deleted file mode 100644
index 4b9b1c5e8f3a..000000000000
--- a/net/wimax/stack.c
+++ /dev/null
@@ -1,609 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * Initialization, addition and removal of wimax devices
- *
- * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- *
- * This implements:
- *
- * - basic life cycle of 'struct wimax_dev' [wimax_dev_*()]; on
- * addition/registration initialize all subfields and allocate
- * generic netlink resources for user space communication. On
- * removal/unregistration, undo all that.
- *
- * - device state machine [wimax_state_change()] and support to send
- * reports to user space when the state changes
- * [wimax_gnl_re_state_change*()].
- *
- * See include/net/wimax.h for rationales and design.
- *
- * ROADMAP
- *
- * [__]wimax_state_change() Called by drivers to update device's state
- * wimax_gnl_re_state_change_alloc()
- * wimax_gnl_re_state_change_send()
- *
- * wimax_dev_init() Init a device
- * wimax_dev_add() Register
- * wimax_rfkill_add()
- * wimax_gnl_add() Register all the generic netlink resources.
- * wimax_id_table_add()
- * wimax_dev_rm() Unregister
- * wimax_id_table_rm()
- * wimax_gnl_rm()
- * wimax_rfkill_rm()
- */
-#include <linux/device.h>
-#include <linux/gfp.h>
-#include <net/genetlink.h>
-#include <linux/netdevice.h>
-#include <linux/wimax.h>
-#include <linux/module.h>
-#include "wimax-internal.h"
-
-
-#define D_SUBMODULE stack
-#include "debug-levels.h"
-
-static char wimax_debug_params[128];
-module_param_string(debug, wimax_debug_params, sizeof(wimax_debug_params),
- 0644);
-MODULE_PARM_DESC(debug,
-		 "String of space-separated NAME:VALUE pairs, where NAMEs "
-		 "are the different debug submodules and VALUEs are the "
-		 "initial debug values to set.");
-
-/*
- * Authoritative source for the RE_STATE_CHANGE attribute policy
- *
- * We don't really use it here, but /me likes to keep the definition
- * close to where the data is generated.
- */
-/*
-static const struct nla_policy wimax_gnl_re_status_change[WIMAX_GNL_ATTR_MAX + 1] = {
- [WIMAX_GNL_STCH_STATE_OLD] = { .type = NLA_U8 },
- [WIMAX_GNL_STCH_STATE_NEW] = { .type = NLA_U8 },
-};
-*/
-
-
-/*
- * Allocate a Report State Change message
- *
- * @header: save it, you need it for _send()
- *
- * Creates and fills a basic state change message; different code
- * paths can then add more attributes to the message as needed.
- *
- * Use wimax_gnl_re_state_change_send() to send the returned skb.
- *
- * Returns: skb with the genl message if ok, IS_ERR() ptr on error
- * with an errno code.
- */
-static
-struct sk_buff *wimax_gnl_re_state_change_alloc(
- struct wimax_dev *wimax_dev,
- enum wimax_st new_state, enum wimax_st old_state,
- void **header)
-{
- int result;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- void *data;
- struct sk_buff *report_skb;
-
- d_fnstart(3, dev, "(wimax_dev %p new_state %u old_state %u)\n",
- wimax_dev, new_state, old_state);
- result = -ENOMEM;
- report_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (report_skb == NULL) {
- dev_err(dev, "RE_STCH: can't create message\n");
- goto error_new;
- }
- /* FIXME: sending a group ID as the seq is wrong */
- data = genlmsg_put(report_skb, 0, wimax_gnl_family.mcgrp_offset,
- &wimax_gnl_family, 0, WIMAX_GNL_RE_STATE_CHANGE);
- if (data == NULL) {
- dev_err(dev, "RE_STCH: can't put data into message\n");
- goto error_put;
- }
- *header = data;
-
- result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_OLD, old_state);
- if (result < 0) {
- dev_err(dev, "RE_STCH: Error adding OLD attr: %d\n", result);
- goto error_put;
- }
- result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_NEW, new_state);
- if (result < 0) {
- dev_err(dev, "RE_STCH: Error adding NEW attr: %d\n", result);
- goto error_put;
- }
- result = nla_put_u32(report_skb, WIMAX_GNL_STCH_IFIDX,
- wimax_dev->net_dev->ifindex);
- if (result < 0) {
- dev_err(dev, "RE_STCH: Error adding IFINDEX attribute\n");
- goto error_put;
- }
- d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %p\n",
- wimax_dev, new_state, old_state, report_skb);
- return report_skb;
-
-error_put:
- nlmsg_free(report_skb);
-error_new:
- d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %d\n",
- wimax_dev, new_state, old_state, result);
- return ERR_PTR(result);
-}
-
-
-/*
- * Send a Report State Change message (as created with _alloc).
- *
- * @report_skb: as returned by wimax_gnl_re_state_change_alloc()
- * @header: as returned by wimax_gnl_re_state_change_alloc()
- *
- * Returns: 0 if ok, < 0 errno code on error.
- *
- * If the message is NULL, pretend it didn't happen.
- */
-static
-int wimax_gnl_re_state_change_send(
- struct wimax_dev *wimax_dev, struct sk_buff *report_skb,
- void *header)
-{
- int result = 0;
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- d_fnstart(3, dev, "(wimax_dev %p report_skb %p)\n",
- wimax_dev, report_skb);
- if (report_skb == NULL) {
- result = -ENOMEM;
- goto out;
- }
- genlmsg_end(report_skb, header);
- genlmsg_multicast(&wimax_gnl_family, report_skb, 0, 0, GFP_KERNEL);
-out:
- d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n",
- wimax_dev, report_skb, result);
- return result;
-}
-
-
-static
-void __check_new_state(enum wimax_st old_state, enum wimax_st new_state,
- unsigned int allowed_states_bm)
-{
- if (WARN_ON(((1 << new_state) & allowed_states_bm) == 0)) {
- pr_err("SW BUG! Forbidden state change %u -> %u\n",
- old_state, new_state);
- }
-}
-
-
-/*
- * Set the current state of a WiMAX device [unlocked version of
- * wimax_state_change()].
- */
-void __wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state)
-{
- struct device *dev = wimax_dev_to_dev(wimax_dev);
- enum wimax_st old_state = wimax_dev->state;
- struct sk_buff *stch_skb;
- void *header;
-
- d_fnstart(3, dev, "(wimax_dev %p new_state %u [old %u])\n",
- wimax_dev, new_state, old_state);
-
- if (WARN_ON(new_state >= __WIMAX_ST_INVALID)) {
- dev_err(dev, "SW BUG: requesting invalid state %u\n",
- new_state);
- goto out;
- }
- if (old_state == new_state)
- goto out;
- header = NULL; /* gcc complains? can't grok why */
- stch_skb = wimax_gnl_re_state_change_alloc(
- wimax_dev, new_state, old_state, &header);
-
- /* Verify the state transition and do exit-from-state actions */
- switch (old_state) {
- case __WIMAX_ST_NULL:
- __check_new_state(old_state, new_state,
- 1 << WIMAX_ST_DOWN);
- break;
- case WIMAX_ST_DOWN:
- __check_new_state(old_state, new_state,
- 1 << __WIMAX_ST_QUIESCING
- | 1 << WIMAX_ST_UNINITIALIZED
- | 1 << WIMAX_ST_RADIO_OFF);
- break;
- case __WIMAX_ST_QUIESCING:
- __check_new_state(old_state, new_state, 1 << WIMAX_ST_DOWN);
- break;
- case WIMAX_ST_UNINITIALIZED:
- __check_new_state(old_state, new_state,
- 1 << __WIMAX_ST_QUIESCING
- | 1 << WIMAX_ST_RADIO_OFF);
- break;
- case WIMAX_ST_RADIO_OFF:
- __check_new_state(old_state, new_state,
- 1 << __WIMAX_ST_QUIESCING
- | 1 << WIMAX_ST_READY);
- break;
- case WIMAX_ST_READY:
- __check_new_state(old_state, new_state,
- 1 << __WIMAX_ST_QUIESCING
- | 1 << WIMAX_ST_RADIO_OFF
- | 1 << WIMAX_ST_SCANNING
- | 1 << WIMAX_ST_CONNECTING
- | 1 << WIMAX_ST_CONNECTED);
- break;
- case WIMAX_ST_SCANNING:
- __check_new_state(old_state, new_state,
- 1 << __WIMAX_ST_QUIESCING
- | 1 << WIMAX_ST_RADIO_OFF
- | 1 << WIMAX_ST_READY
- | 1 << WIMAX_ST_CONNECTING
- | 1 << WIMAX_ST_CONNECTED);
- break;
- case WIMAX_ST_CONNECTING:
- __check_new_state(old_state, new_state,
- 1 << __WIMAX_ST_QUIESCING
- | 1 << WIMAX_ST_RADIO_OFF
- | 1 << WIMAX_ST_READY
- | 1 << WIMAX_ST_SCANNING
- | 1 << WIMAX_ST_CONNECTED);
- break;
- case WIMAX_ST_CONNECTED:
- __check_new_state(old_state, new_state,
- 1 << __WIMAX_ST_QUIESCING
- | 1 << WIMAX_ST_RADIO_OFF
- | 1 << WIMAX_ST_READY);
- netif_tx_disable(wimax_dev->net_dev);
- netif_carrier_off(wimax_dev->net_dev);
- break;
- case __WIMAX_ST_INVALID:
- default:
- dev_err(dev, "SW BUG: wimax_dev %p is in unknown state %u\n",
- wimax_dev, wimax_dev->state);
- WARN_ON(1);
- goto out;
- }
-
- /* Execute the actions of entry to the new state */
- switch (new_state) {
- case __WIMAX_ST_NULL:
- dev_err(dev, "SW BUG: wimax_dev %p entering NULL state "
- "from %u\n", wimax_dev, wimax_dev->state);
- WARN_ON(1); /* Nobody can enter this state */
- break;
- case WIMAX_ST_DOWN:
- break;
- case __WIMAX_ST_QUIESCING:
- break;
- case WIMAX_ST_UNINITIALIZED:
- break;
- case WIMAX_ST_RADIO_OFF:
- break;
- case WIMAX_ST_READY:
- break;
- case WIMAX_ST_SCANNING:
- break;
- case WIMAX_ST_CONNECTING:
- break;
- case WIMAX_ST_CONNECTED:
- netif_carrier_on(wimax_dev->net_dev);
- netif_wake_queue(wimax_dev->net_dev);
- break;
- case __WIMAX_ST_INVALID:
- default:
- BUG();
- }
- __wimax_state_set(wimax_dev, new_state);
- if (!IS_ERR(stch_skb))
- wimax_gnl_re_state_change_send(wimax_dev, stch_skb, header);
-out:
- d_fnend(3, dev, "(wimax_dev %p new_state %u [old %u]) = void\n",
- wimax_dev, new_state, old_state);
-}
-
-
-/**
- * wimax_state_change - Set the current state of a WiMAX device
- *
- * @wimax_dev: WiMAX device descriptor (properly referenced)
- * @new_state: New state to switch to
- *
- * This implements the state changes for the wimax devices. It will
- *
- * - verify that the state transition is legal (for now it'll just
- * print a warning if not) according to the table in
- * linux/wimax.h's documentation for 'enum wimax_st'.
- *
- * - perform the actions needed for leaving the current state and
- * whichever are needed for entering the new state.
- *
- * - issue a report to user space indicating the new state (and an
- * optional payload with information about the new state).
- *
- * NOTE: @wimax_dev must be locked
- */
-void wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state)
-{
- /*
-	 * A driver cannot take the wimax_dev out of the
-	 * __WIMAX_ST_NULL state except by calling wimax_dev_add(). If
-	 * the wimax_dev's state is still NULL, we ignore any request
-	 * to change its state because it means it has not yet been
-	 * registered.
- *
- * There is no need to complain about it, as routines that
- * call this might be shared from different code paths that
- * are called before or after wimax_dev_add() has done its
- * job.
- */
- mutex_lock(&wimax_dev->mutex);
- if (wimax_dev->state > __WIMAX_ST_NULL)
- __wimax_state_change(wimax_dev, new_state);
- mutex_unlock(&wimax_dev->mutex);
-}
-EXPORT_SYMBOL_GPL(wimax_state_change);
-
-
-/**
- * wimax_state_get() - Return the current state of a WiMAX device
- *
- * @wimax_dev: WiMAX device descriptor
- *
- * Returns: Current state of the device according to its driver.
- */
-enum wimax_st wimax_state_get(struct wimax_dev *wimax_dev)
-{
- enum wimax_st state;
- mutex_lock(&wimax_dev->mutex);
- state = wimax_dev->state;
- mutex_unlock(&wimax_dev->mutex);
- return state;
-}
-EXPORT_SYMBOL_GPL(wimax_state_get);
-
-
-/**
- * wimax_dev_init - initialize a newly allocated instance
- *
- * @wimax_dev: WiMAX device descriptor to initialize.
- *
- * Initializes fields of a freshly allocated @wimax_dev instance. This
- * function assumes that after allocation, the memory occupied by
- * @wimax_dev was zeroed.
- */
-void wimax_dev_init(struct wimax_dev *wimax_dev)
-{
- INIT_LIST_HEAD(&wimax_dev->id_table_node);
- __wimax_state_set(wimax_dev, __WIMAX_ST_NULL);
- mutex_init(&wimax_dev->mutex);
- mutex_init(&wimax_dev->mutex_reset);
-}
-EXPORT_SYMBOL_GPL(wimax_dev_init);
-
-static const struct nla_policy wimax_gnl_policy[WIMAX_GNL_ATTR_MAX + 1] = {
- [WIMAX_GNL_RESET_IFIDX] = { .type = NLA_U32, },
- [WIMAX_GNL_RFKILL_IFIDX] = { .type = NLA_U32, },
- [WIMAX_GNL_RFKILL_STATE] = {
- .type = NLA_U32 /* enum wimax_rf_state */
- },
- [WIMAX_GNL_STGET_IFIDX] = { .type = NLA_U32, },
- [WIMAX_GNL_MSG_IFIDX] = { .type = NLA_U32, },
- [WIMAX_GNL_MSG_DATA] = {
- .type = NLA_UNSPEC, /* libnl doesn't grok BINARY yet */
- },
-};
-
-static const struct genl_ops wimax_gnl_ops[] = {
- {
- .cmd = WIMAX_GNL_OP_MSG_FROM_USER,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .flags = GENL_ADMIN_PERM,
- .doit = wimax_gnl_doit_msg_from_user,
- },
- {
- .cmd = WIMAX_GNL_OP_RESET,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .flags = GENL_ADMIN_PERM,
- .doit = wimax_gnl_doit_reset,
- },
- {
- .cmd = WIMAX_GNL_OP_RFKILL,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .flags = GENL_ADMIN_PERM,
- .doit = wimax_gnl_doit_rfkill,
- },
- {
- .cmd = WIMAX_GNL_OP_STATE_GET,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .flags = GENL_ADMIN_PERM,
- .doit = wimax_gnl_doit_state_get,
- },
-};
-
-
-static
-size_t wimax_addr_scnprint(char *addr_str, size_t addr_str_size,
- unsigned char *addr, size_t addr_len)
-{
- unsigned int cnt, total;
-
- for (total = cnt = 0; cnt < addr_len; cnt++)
- total += scnprintf(addr_str + total, addr_str_size - total,
- "%02x%c", addr[cnt],
- cnt == addr_len - 1 ? '\0' : ':');
- return total;
-}
-
-
-/**
- * wimax_dev_add - Register a new WiMAX device
- *
- * @wimax_dev: WiMAX device descriptor (as embedded in your @net_dev's
- * priv data). You must have called wimax_dev_init() on it before.
- *
- * @net_dev: net device the @wimax_dev is associated with. The
- * function expects SET_NETDEV_DEV() and register_netdev() to have
- * been called on it already.
- *
- * Registers the new WiMAX device, sets up the user-kernel control
- * interface (generic netlink) and common WiMAX infrastructure.
- *
- * Note that the parts that will allow interaction with user space are
- * set up at the very end, when the rest is in place, as once that
- * happens, the driver might get user space control requests via
- * netlink or from debugfs that might translate into calls into
- * wimax_dev->op_*().
- */
-int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev)
-{
- int result;
- struct device *dev = net_dev->dev.parent;
- char addr_str[32];
-
- d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev);
-
- /* Do the RFKILL setup before locking, as RFKILL will call
- * into our functions.
- */
- wimax_dev->net_dev = net_dev;
- result = wimax_rfkill_add(wimax_dev);
- if (result < 0)
- goto error_rfkill_add;
-
- /* Set up user-space interaction */
- mutex_lock(&wimax_dev->mutex);
- wimax_id_table_add(wimax_dev);
- wimax_debugfs_add(wimax_dev);
-
- __wimax_state_set(wimax_dev, WIMAX_ST_DOWN);
- mutex_unlock(&wimax_dev->mutex);
-
- wimax_addr_scnprint(addr_str, sizeof(addr_str),
- net_dev->dev_addr, net_dev->addr_len);
- dev_err(dev, "WiMAX interface %s (%s) ready\n",
- net_dev->name, addr_str);
- d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev);
- return 0;
-
-error_rfkill_add:
- d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n",
- wimax_dev, net_dev, result);
- return result;
-}
-EXPORT_SYMBOL_GPL(wimax_dev_add);
-
-
-/**
- * wimax_dev_rm - Unregister an existing WiMAX device
- *
- * @wimax_dev: WiMAX device descriptor
- *
- * Unregisters a WiMAX device previously registered for use with
- * wimax_dev_add().
- *
- * IMPORTANT! Must call before calling unregister_netdev().
- *
- * After this function returns, you will not get any more user space
- * control requests (via netlink or debugfs), and thus no more calls
- * to the wimax_dev->op_*() handlers.
- *
- * Reentrancy control is ensured by setting the state to
- * %__WIMAX_ST_QUIESCING. rfkill operations coming through
- * wimax_*rfkill*() will be stopped by the quiescing state; ops coming
- * from the rfkill subsystem will be stopped by the support being
- * removed by wimax_rfkill_rm().
- */
-void wimax_dev_rm(struct wimax_dev *wimax_dev)
-{
- d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev);
-
- mutex_lock(&wimax_dev->mutex);
- __wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING);
- wimax_debugfs_rm(wimax_dev);
- wimax_id_table_rm(wimax_dev);
- __wimax_state_change(wimax_dev, WIMAX_ST_DOWN);
- mutex_unlock(&wimax_dev->mutex);
- wimax_rfkill_rm(wimax_dev);
- d_fnend(3, NULL, "(wimax_dev %p) = void\n", wimax_dev);
-}
-EXPORT_SYMBOL_GPL(wimax_dev_rm);
-
-
-/* Debug framework control of debug levels */
-struct d_level D_LEVEL[] = {
- D_SUBMODULE_DEFINE(debugfs),
- D_SUBMODULE_DEFINE(id_table),
- D_SUBMODULE_DEFINE(op_msg),
- D_SUBMODULE_DEFINE(op_reset),
- D_SUBMODULE_DEFINE(op_rfkill),
- D_SUBMODULE_DEFINE(op_state_get),
- D_SUBMODULE_DEFINE(stack),
-};
-size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
-
-
-static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
- { .name = "msg", },
-};
-
-struct genl_family wimax_gnl_family __ro_after_init = {
- .name = "WiMAX",
- .version = WIMAX_GNL_VERSION,
- .hdrsize = 0,
- .maxattr = WIMAX_GNL_ATTR_MAX,
- .policy = wimax_gnl_policy,
- .module = THIS_MODULE,
- .ops = wimax_gnl_ops,
- .n_ops = ARRAY_SIZE(wimax_gnl_ops),
- .mcgrps = wimax_gnl_mcgrps,
- .n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps),
-};
-
-
-
-/* Initialize the wimax stack */
-static
-int __init wimax_subsys_init(void)
-{
- int result;
-
- d_fnstart(4, NULL, "()\n");
- d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params,
- "wimax.debug");
-
- result = genl_register_family(&wimax_gnl_family);
- if (unlikely(result < 0)) {
- pr_err("cannot register generic netlink family: %d\n", result);
- goto error_register_family;
- }
-
- d_fnend(4, NULL, "() = 0\n");
- return 0;
-
-error_register_family:
- d_fnend(4, NULL, "() = %d\n", result);
- return result;
-
-}
-module_init(wimax_subsys_init);
-
-
-/* Shutdown the wimax stack */
-static
-void __exit wimax_subsys_exit(void)
-{
- wimax_id_table_release();
- genl_unregister_family(&wimax_gnl_family);
-}
-module_exit(wimax_subsys_exit);
-
-MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>");
-MODULE_DESCRIPTION("Linux WiMAX stack");
-MODULE_LICENSE("GPL");
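
The __wimax_state_change() code removed above validates transitions with one bitmask of permitted next states per current state; a forbidden transition only triggers a WARN rather than being refused. The technique itself is generic. Below is a self-contained sketch of the same idea as standalone C, using a made-up three-state machine rather than the real enum wimax_st values.

#include <stdio.h>

enum st { ST_DOWN, ST_READY, ST_CONNECTED, ST_MAX };

/* one bitmask of permitted next states per current state */
static const unsigned int allowed[ST_MAX] = {
	[ST_DOWN]      = 1 << ST_READY,
	[ST_READY]     = 1 << ST_DOWN | 1 << ST_CONNECTED,
	[ST_CONNECTED] = 1 << ST_READY | 1 << ST_DOWN,
};

static int transition_ok(enum st old, enum st new)
{
	return (allowed[old] & (1u << new)) != 0;
}

int main(void)
{
	printf("%d\n", transition_ok(ST_DOWN, ST_READY));	/* prints 1 */
	printf("%d\n", transition_ok(ST_DOWN, ST_CONNECTED));	/* prints 0 */
	return 0;
}
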
diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h
deleted file mode 100644
index 40751207296c..000000000000
--- a/net/wimax/wimax-internal.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Linux WiMAX
- * Internal API for kernel space WiMAX stack
- *
- * Copyright (C) 2007 Intel Corporation <linux-wimax@intel.com>
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- *
- * This header file is for declarations and definitions internal to
- * the WiMAX stack. For public APIs and documentation, see
- * include/net/wimax.h and include/linux/wimax.h.
- */
-
-#ifndef __WIMAX_INTERNAL_H__
-#define __WIMAX_INTERNAL_H__
-#ifdef __KERNEL__
-
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/device.h>
-#include <net/wimax.h>
-
-
-/*
- * Decide if a (locked) device is ready for use
- *
- * Before using the device structure, it must be locked
- * (wimax_dev->mutex). In addition, most operations need to call this
- * function to check that the state is the right one.
- *
- * An error value will be returned if the state is not the right
- * one. In that case, the caller should not attempt to use the device
- * and just unlock it.
- */
-static inline __must_check
-int wimax_dev_is_ready(struct wimax_dev *wimax_dev)
-{
- if (wimax_dev->state == __WIMAX_ST_NULL)
- return -EINVAL; /* Device is not even registered! */
- if (wimax_dev->state == WIMAX_ST_DOWN)
- return -ENOMEDIUM;
- if (wimax_dev->state == __WIMAX_ST_QUIESCING)
- return -ESHUTDOWN;
- return 0;
-}
-
-
-static inline
-void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state)
-{
- wimax_dev->state = state;
-}
-void __wimax_state_change(struct wimax_dev *, enum wimax_st);
-
-#ifdef CONFIG_DEBUG_FS
-void wimax_debugfs_add(struct wimax_dev *);
-void wimax_debugfs_rm(struct wimax_dev *);
-#else
-static inline void wimax_debugfs_add(struct wimax_dev *wimax_dev) {}
-static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {}
-#endif
-
-void wimax_id_table_add(struct wimax_dev *);
-struct wimax_dev *wimax_dev_get_by_genl_info(struct genl_info *, int);
-void wimax_id_table_rm(struct wimax_dev *);
-void wimax_id_table_release(void);
-
-int wimax_rfkill_add(struct wimax_dev *);
-void wimax_rfkill_rm(struct wimax_dev *);
-
-/* generic netlink */
-extern struct genl_family wimax_gnl_family;
-
-/* ops */
-int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info);
-int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info);
-int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info);
-int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info);
-
-#endif /* #ifdef __KERNEL__ */
-#endif /* #ifndef __WIMAX_INTERNAL_H__ */
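
wimax_dev_is_ready() maps the device state to an errno (-EINVAL when unregistered, -ENOMEDIUM when down, -ESHUTDOWN while quiescing) so a caller can gate an operation with a single check under the mutex. The usual calling pattern in the removed op-*.c files looks roughly like the fragment below; it is a sketch, not a literal excerpt, and assumes result and wimax_dev are already in scope.

	mutex_lock(&wimax_dev->mutex);
	result = wimax_dev_is_ready(wimax_dev);
	if (result < 0)
		goto error_not_ready;	/* device not usable in this state */
	/* ... perform the operation on the locked device ... */
error_not_ready:
	mutex_unlock(&wimax_dev->mutex);
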
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index 27026f587fa6..f620acd2a0f5 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -21,6 +21,7 @@ config CFG80211
tristate "cfg80211 - wireless configuration API"
depends on RFKILL || !RFKILL
select FW_LOADER
+ select CRC32
# may need to update this when certificates are changed and are
# using a different algorithm, though right now they shouldn't
# (this is here rather than below to allow it to be a module)
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 6a6f2f214c10..e4030f1fbc60 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -141,9 +141,62 @@ static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef)
return true;
}
+static int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width)
+{
+ int mhz;
+
+ switch (chan_width) {
+ case NL80211_CHAN_WIDTH_1:
+ mhz = 1;
+ break;
+ case NL80211_CHAN_WIDTH_2:
+ mhz = 2;
+ break;
+ case NL80211_CHAN_WIDTH_4:
+ mhz = 4;
+ break;
+ case NL80211_CHAN_WIDTH_8:
+ mhz = 8;
+ break;
+ case NL80211_CHAN_WIDTH_16:
+ mhz = 16;
+ break;
+ case NL80211_CHAN_WIDTH_5:
+ mhz = 5;
+ break;
+ case NL80211_CHAN_WIDTH_10:
+ mhz = 10;
+ break;
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ mhz = 20;
+ break;
+ case NL80211_CHAN_WIDTH_40:
+ mhz = 40;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_80:
+ mhz = 80;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ mhz = 160;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ return mhz;
+}
+
+static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
+{
+ return nl80211_chan_width_to_mhz(c->width);
+}
+
bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
{
- u32 control_freq;
+ u32 control_freq, oper_freq;
+ int oper_width, control_width;
if (!chandef->chan)
return false;
@@ -154,11 +207,6 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
control_freq = chandef->chan->center_freq;
switch (chandef->width) {
- case NL80211_CHAN_WIDTH_1:
- case NL80211_CHAN_WIDTH_2:
- case NL80211_CHAN_WIDTH_4:
- case NL80211_CHAN_WIDTH_8:
- case NL80211_CHAN_WIDTH_16:
case NL80211_CHAN_WIDTH_5:
case NL80211_CHAN_WIDTH_10:
case NL80211_CHAN_WIDTH_20:
@@ -169,6 +217,34 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
if (chandef->center_freq2)
return false;
break;
+ case NL80211_CHAN_WIDTH_1:
+ case NL80211_CHAN_WIDTH_2:
+ case NL80211_CHAN_WIDTH_4:
+ case NL80211_CHAN_WIDTH_8:
+ case NL80211_CHAN_WIDTH_16:
+ if (chandef->chan->band != NL80211_BAND_S1GHZ)
+ return false;
+
+ control_freq = ieee80211_channel_to_khz(chandef->chan);
+ oper_freq = ieee80211_chandef_to_khz(chandef);
+ control_width = nl80211_chan_width_to_mhz(
+ ieee80211_s1g_channel_width(
+ chandef->chan));
+ oper_width = cfg80211_chandef_get_width(chandef);
+
+ if (oper_width < 0 || control_width < 0)
+ return false;
+ if (chandef->center_freq2)
+ return false;
+
+ if (control_freq + MHZ_TO_KHZ(control_width) / 2 >
+ oper_freq + MHZ_TO_KHZ(oper_width) / 2)
+ return false;
+
+ if (control_freq - MHZ_TO_KHZ(control_width) / 2 <
+ oper_freq - MHZ_TO_KHZ(oper_width) / 2)
+ return false;
+ break;
case NL80211_CHAN_WIDTH_40:
if (chandef->center_freq1 != control_freq + 10 &&
chandef->center_freq1 != control_freq - 10)
@@ -264,53 +340,6 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c,
}
}
-static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
-{
- int width;
-
- switch (c->width) {
- case NL80211_CHAN_WIDTH_1:
- width = 1;
- break;
- case NL80211_CHAN_WIDTH_2:
- width = 2;
- break;
- case NL80211_CHAN_WIDTH_4:
- width = 4;
- break;
- case NL80211_CHAN_WIDTH_8:
- width = 8;
- break;
- case NL80211_CHAN_WIDTH_16:
- width = 16;
- break;
- case NL80211_CHAN_WIDTH_5:
- width = 5;
- break;
- case NL80211_CHAN_WIDTH_10:
- width = 10;
- break;
- case NL80211_CHAN_WIDTH_20:
- case NL80211_CHAN_WIDTH_20_NOHT:
- width = 20;
- break;
- case NL80211_CHAN_WIDTH_40:
- width = 40;
- break;
- case NL80211_CHAN_WIDTH_80P80:
- case NL80211_CHAN_WIDTH_80:
- width = 80;
- break;
- case NL80211_CHAN_WIDTH_160:
- width = 160;
- break;
- default:
- WARN_ON_ONCE(1);
- return -1;
- }
- return width;
-}
-
const struct cfg80211_chan_def *
cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
const struct cfg80211_chan_def *c2)
@@ -501,10 +530,10 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_MONITOR:
case NL80211_IFTYPE_AP_VLAN:
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_P2P_DEVICE:
case NL80211_IFTYPE_NAN:
break;
+ case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_UNSPECIFIED:
case NUM_NL80211_IFTYPES:
WARN_ON(1);
@@ -648,12 +677,12 @@ bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev)
case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_MONITOR:
case NL80211_IFTYPE_AP_VLAN:
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_P2P_DEVICE:
/* Can NAN type be considered as beaconing interface? */
case NL80211_IFTYPE_NAN:
break;
case NL80211_IFTYPE_UNSPECIFIED:
+ case NL80211_IFTYPE_WDS:
case NUM_NL80211_IFTYPES:
WARN_ON(1);
}
@@ -1295,12 +1324,12 @@ cfg80211_get_chan_state(struct wireless_dev *wdev,
break;
case NL80211_IFTYPE_MONITOR:
case NL80211_IFTYPE_AP_VLAN:
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_P2P_DEVICE:
case NL80211_IFTYPE_NAN:
/* these interface types don't really have a channel */
return;
case NL80211_IFTYPE_UNSPECIFIED:
+ case NL80211_IFTYPE_WDS:
case NUM_NL80211_IFTYPES:
WARN_ON(1);
}
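
The new S1G branch in cfg80211_chandef_valid() accepts a chandef only when the (1-16 MHz) control channel lies entirely inside the operating channel, comparing the channel edges in kHz. The containment check can be illustrated with the standalone sketch below; the frequencies are made-up values, not taken from any band definition.

#include <stdio.h>

#define MHZ_TO_KHZ(f) ((f) * 1000)

/* does the control channel lie entirely within the operating channel? */
static int control_fits(unsigned int ctrl_khz, unsigned int ctrl_mhz,
			unsigned int oper_khz, unsigned int oper_mhz)
{
	if (ctrl_khz + MHZ_TO_KHZ(ctrl_mhz) / 2 >
	    oper_khz + MHZ_TO_KHZ(oper_mhz) / 2)
		return 0;
	if (ctrl_khz - MHZ_TO_KHZ(ctrl_mhz) / 2 <
	    oper_khz - MHZ_TO_KHZ(oper_mhz) / 2)
		return 0;
	return 1;
}

int main(void)
{
	/* 1 MHz control channel at 902.5 MHz inside a 2 MHz channel at 903 MHz */
	printf("%d\n", control_fits(902500, 1, 903000, 2));	/* prints 1 */
	printf("%d\n", control_fits(905500, 1, 903000, 2));	/* prints 0 */
	return 0;
}
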
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 354b0ccbdc24..4b1f35e976e7 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -236,7 +236,9 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
rdev->opencount--;
if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
- if (WARN_ON(!rdev->scan_req->notified))
+ if (WARN_ON(!rdev->scan_req->notified &&
+ (!rdev->int_scan_req ||
+ !rdev->int_scan_req->notified)))
rdev->scan_req->info.aborted = true;
___cfg80211_scan_done(rdev, false);
}
@@ -629,10 +631,8 @@ static int wiphy_verify_combinations(struct wiphy *wiphy)
return -EINVAL;
}
-#ifndef CONFIG_WIRELESS_WDS
if (WARN_ON(all_iftypes & BIT(NL80211_IFTYPE_WDS)))
return -EINVAL;
-#endif
/* You can't even choose that many! */
if (WARN_ON(cnt < c->max_interfaces))
@@ -673,10 +673,8 @@ int wiphy_register(struct wiphy *wiphy)
!(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ)))))
return -EINVAL;
-#ifndef CONFIG_WIRELESS_WDS
if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS)))
return -EINVAL;
-#endif
if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported))
return -EINVAL;
@@ -1200,9 +1198,6 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
case NL80211_IFTYPE_OCB:
__cfg80211_leave_ocb(rdev, dev);
break;
- case NL80211_IFTYPE_WDS:
- /* must be handled by mac80211/driver, has no APIs */
- break;
case NL80211_IFTYPE_P2P_DEVICE:
case NL80211_IFTYPE_NAN:
/* cannot happen, has no netdev */
@@ -1212,6 +1207,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
/* nothing to do */
break;
case NL80211_IFTYPE_UNSPECIFIED:
+ case NL80211_IFTYPE_WDS:
case NUM_NL80211_IFTYPES:
/* invalid */
break;
@@ -1248,8 +1244,7 @@ void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
}
EXPORT_SYMBOL(cfg80211_stop_iface);
-void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
- struct wireless_dev *wdev)
+void cfg80211_init_wdev(struct wireless_dev *wdev)
{
mutex_init(&wdev->mtx);
INIT_LIST_HEAD(&wdev->event_list);
@@ -1260,6 +1255,30 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
spin_lock_init(&wdev->pmsr_lock);
INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk);
+#ifdef CONFIG_CFG80211_WEXT
+ wdev->wext.default_key = -1;
+ wdev->wext.default_mgmt_key = -1;
+ wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+#endif
+
+ if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT)
+ wdev->ps = true;
+ else
+ wdev->ps = false;
+ /* allow mac80211 to determine the timeout */
+ wdev->ps_timeout = -1;
+
+ if ((wdev->iftype == NL80211_IFTYPE_STATION ||
+ wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
+ wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
+ wdev->netdev->priv_flags |= IFF_DONT_BRIDGE;
+
+ INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
+}
+
+void cfg80211_register_wdev(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
/*
* We get here also when the interface changes network namespaces,
* as it's registered into the new one, but we don't want it to
@@ -1293,6 +1312,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
switch (state) {
case NETDEV_POST_INIT:
SET_NETDEV_DEVTYPE(dev, &wiphy_type);
+ wdev->netdev = dev;
+ /* can only change netns with wiphy */
+ dev->features |= NETIF_F_NETNS_LOCAL;
+
+ cfg80211_init_wdev(wdev);
break;
case NETDEV_REGISTER:
/*
@@ -1300,35 +1324,12 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
* called within code protected by it when interfaces
* are added with nl80211.
*/
- /* can only change netns with wiphy */
- dev->features |= NETIF_F_NETNS_LOCAL;
-
if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj,
"phy80211")) {
pr_err("failed to add phy80211 symlink to netdev!\n");
}
- wdev->netdev = dev;
-#ifdef CONFIG_CFG80211_WEXT
- wdev->wext.default_key = -1;
- wdev->wext.default_mgmt_key = -1;
- wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
-#endif
-
- if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT)
- wdev->ps = true;
- else
- wdev->ps = false;
- /* allow mac80211 to determine the timeout */
- wdev->ps_timeout = -1;
-
- if ((wdev->iftype == NL80211_IFTYPE_STATION ||
- wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
- wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
- dev->priv_flags |= IFF_DONT_BRIDGE;
-
- INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
- cfg80211_init_wdev(rdev, wdev);
+ cfg80211_register_wdev(rdev, wdev);
break;
case NETDEV_GOING_DOWN:
cfg80211_leave(rdev, wdev);
@@ -1336,7 +1337,9 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
case NETDEV_DOWN:
cfg80211_update_iface_num(rdev, wdev->iftype, -1);
if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
- if (WARN_ON(!rdev->scan_req->notified))
+ if (WARN_ON(!rdev->scan_req->notified &&
+ (!rdev->int_scan_req ||
+ !rdev->int_scan_req->notified)))
rdev->scan_req->info.aborted = true;
___cfg80211_scan_done(rdev, false);
}
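
The core.c change splits wdev setup in two: per-device initialisation now happens at NETDEV_POST_INIT (before the netdev is visible), while insertion into the rdev lists and the sysfs link stay at NETDEV_REGISTER. A skeletal netdevice-notifier dispatch showing where each half runs is sketched below; example_netdev_notifier is a hypothetical name and the bodies are placeholders, not the cfg80211 functions.

static int example_netdev_notifier(struct notifier_block *nb,
				   unsigned long state, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (!dev)
		return NOTIFY_DONE;

	switch (state) {
	case NETDEV_POST_INIT:
		/* fill in per-device private state; dev is not visible yet */
		break;
	case NETDEV_REGISTER:
		/* dev is visible now: add it to lists, create sysfs links */
		break;
	default:
		break;
	}
	return NOTIFY_DONE;
}
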
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 67b0389fca4d..7df91f940212 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -3,7 +3,7 @@
* Wireless configuration interface internals.
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*/
#ifndef __NET_WIRELESS_CORE_H
#define __NET_WIRELESS_CORE_H
@@ -72,6 +72,7 @@ struct cfg80211_registered_device {
u32 bss_generation;
u32 bss_entries;
struct cfg80211_scan_request *scan_req; /* protected by RTNL */
+ struct cfg80211_scan_request *int_scan_req;
struct sk_buff *scan_msg;
struct list_head sched_scan_req_list;
time64_t suspend_at;
@@ -208,8 +209,9 @@ struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);
int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
struct net *net);
-void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
- struct wireless_dev *wdev);
+void cfg80211_init_wdev(struct wireless_dev *wdev);
+void cfg80211_register_wdev(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev);
static inline void wdev_lock(struct wireless_dev *wdev)
__acquires(wdev)
@@ -431,6 +433,8 @@ void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev);
/* internal helpers */
bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher);
+bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev,
+ int key_idx, bool pairwise);
int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
struct key_params *params, int key_idx,
bool pairwise, const u8 *mac_addr);
@@ -457,6 +461,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev);
bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
u32 center_freq_khz, u32 bw_khz);
+int cfg80211_scan(struct cfg80211_registered_device *rdev);
+
extern struct work_struct cfg80211_disconnect_work;
/**
@@ -466,8 +472,8 @@ extern struct work_struct cfg80211_disconnect_work;
*
* Checks if chandef is usable and we can/need start CAC on such channel.
*
- * Return: Return true if all channels available and at least
- * one channel require CAC (NL80211_DFS_USABLE)
+ * Return: true if all channels available and at least
+ * one channel requires CAC (NL80211_DFS_USABLE)
*/
bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
const struct cfg80211_chan_def *chandef);
diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c
index cc7b9fd5c166..d66a913027e0 100644
--- a/net/wireless/lib80211.c
+++ b/net/wireless/lib80211.c
@@ -26,8 +26,6 @@
#include <net/lib80211.h>
-#define DRV_NAME "lib80211"
-
#define DRV_DESCRIPTION "common routines for IEEE802.11 drivers"
MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index db7333e20dd7..e1e90761dc00 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -4,7 +4,7 @@
*
* Copyright (c) 2009, Jouni Malinen <j@w1.fi>
* Copyright (c) 2015 Intel Deutschland GmbH
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2020 Intel Corporation
*/
#include <linux/kernel.h>
@@ -30,6 +30,15 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
struct cfg80211_connect_resp_params cr;
+ const u8 *resp_ie = mgmt->u.assoc_resp.variable;
+ size_t resp_ie_len = len - offsetof(struct ieee80211_mgmt,
+ u.assoc_resp.variable);
+
+ if (bss->channel->band == NL80211_BAND_S1GHZ) {
+ resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable;
+ resp_ie_len = len - offsetof(struct ieee80211_mgmt,
+ u.s1g_assoc_resp.variable);
+ }
memset(&cr, 0, sizeof(cr));
cr.status = (int)le16_to_cpu(mgmt->u.assoc_resp.status_code);
@@ -37,9 +46,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
cr.bss = bss;
cr.req_ie = req_ies;
cr.req_ie_len = req_ies_len;
- cr.resp_ie = mgmt->u.assoc_resp.variable;
- cr.resp_ie_len =
- len - offsetof(struct ieee80211_mgmt, u.assoc_resp.variable);
+ cr.resp_ie = resp_ie;
+ cr.resp_ie_len = resp_ie_len;
cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED;
trace_cfg80211_send_rx_assoc(dev, bss);
@@ -73,7 +81,8 @@ static void cfg80211_process_auth(struct wireless_dev *wdev,
}
static void cfg80211_process_deauth(struct wireless_dev *wdev,
- const u8 *buf, size_t len)
+ const u8 *buf, size_t len,
+ bool reconnect)
{
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
@@ -81,7 +90,7 @@ static void cfg80211_process_deauth(struct wireless_dev *wdev,
u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr);
- nl80211_send_deauth(rdev, wdev->netdev, buf, len, GFP_KERNEL);
+ nl80211_send_deauth(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL);
if (!wdev->current_bss ||
!ether_addr_equal(wdev->current_bss->pub.bssid, bssid))
@@ -92,7 +101,8 @@ static void cfg80211_process_deauth(struct wireless_dev *wdev,
}
static void cfg80211_process_disassoc(struct wireless_dev *wdev,
- const u8 *buf, size_t len)
+ const u8 *buf, size_t len,
+ bool reconnect)
{
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
@@ -100,7 +110,8 @@ static void cfg80211_process_disassoc(struct wireless_dev *wdev,
u16 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr);
- nl80211_send_disassoc(rdev, wdev->netdev, buf, len, GFP_KERNEL);
+ nl80211_send_disassoc(rdev, wdev->netdev, buf, len, reconnect,
+ GFP_KERNEL);
if (WARN_ON(!wdev->current_bss ||
!ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
@@ -125,9 +136,9 @@ void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len)
if (ieee80211_is_auth(mgmt->frame_control))
cfg80211_process_auth(wdev, buf, len);
else if (ieee80211_is_deauth(mgmt->frame_control))
- cfg80211_process_deauth(wdev, buf, len);
+ cfg80211_process_deauth(wdev, buf, len, false);
else if (ieee80211_is_disassoc(mgmt->frame_control))
- cfg80211_process_disassoc(wdev, buf, len);
+ cfg80211_process_disassoc(wdev, buf, len, false);
}
EXPORT_SYMBOL(cfg80211_rx_mlme_mgmt);
@@ -172,22 +183,23 @@ void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss)
}
EXPORT_SYMBOL(cfg80211_abandon_assoc);
-void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len)
+void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len,
+ bool reconnect)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct ieee80211_mgmt *mgmt = (void *)buf;
ASSERT_WDEV_LOCK(wdev);
- trace_cfg80211_tx_mlme_mgmt(dev, buf, len);
+ trace_cfg80211_tx_mlme_mgmt(dev, buf, len, reconnect);
if (WARN_ON(len < 2))
return;
if (ieee80211_is_deauth(mgmt->frame_control))
- cfg80211_process_deauth(wdev, buf, len);
+ cfg80211_process_deauth(wdev, buf, len, reconnect);
else
- cfg80211_process_disassoc(wdev, buf, len);
+ cfg80211_process_disassoc(wdev, buf, len, reconnect);
}
EXPORT_SYMBOL(cfg80211_tx_mlme_mgmt);
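
cfg80211_rx_assoc_resp() now picks the IE offset from the frame layout: S1G association responses have a shorter fixed part than legacy ones, so the start of the variable IEs is computed with offsetof() against the matching struct. The standalone sketch below illustrates that offset arithmetic with simplified struct layouts, not the real struct ieee80211_mgmt, and an illustrative frame length.

#include <stdio.h>
#include <stddef.h>

struct legacy_resp { unsigned short capab, status, aid; unsigned char variable[]; };
struct s1g_resp    { unsigned short capab, status;      unsigned char variable[]; };

int main(void)
{
	size_t len = 64;	/* total frame body length, illustrative */

	printf("legacy IEs start at %zu, length %zu\n",
	       offsetof(struct legacy_resp, variable),
	       len - offsetof(struct legacy_resp, variable));
	printf("s1g IEs start at %zu, length %zu\n",
	       offsetof(struct s1g_resp, variable),
	       len - offsetof(struct s1g_resp, variable));
	return 0;
}
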
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7fd45f6ddb05..775d0c4d86c3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -209,14 +209,23 @@ static int validate_beacon_head(const struct nlattr *attr,
unsigned int len = nla_len(attr);
const struct element *elem;
const struct ieee80211_mgmt *mgmt = (void *)data;
- unsigned int fixedlen = offsetof(struct ieee80211_mgmt,
- u.beacon.variable);
+ bool s1g_bcn = ieee80211_is_s1g_beacon(mgmt->frame_control);
+ unsigned int fixedlen, hdrlen;
+
+ if (s1g_bcn) {
+ fixedlen = offsetof(struct ieee80211_ext,
+ u.s1g_beacon.variable);
+ hdrlen = offsetof(struct ieee80211_ext, u.s1g_beacon);
+ } else {
+ fixedlen = offsetof(struct ieee80211_mgmt,
+ u.beacon.variable);
+ hdrlen = offsetof(struct ieee80211_mgmt, u.beacon);
+ }
if (len < fixedlen)
goto err;
- if (ieee80211_hdrlen(mgmt->frame_control) !=
- offsetof(struct ieee80211_mgmt, u.beacon))
+ if (ieee80211_hdrlen(mgmt->frame_control) != hdrlen)
goto err;
data += fixedlen;
@@ -320,6 +329,13 @@ he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = {
NLA_POLICY_RANGE(NLA_U8, 1, 20),
[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET] =
NLA_POLICY_RANGE(NLA_U8, 1, 20),
+ [NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET] =
+ NLA_POLICY_RANGE(NLA_U8, 1, 20),
+ [NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP] =
+ NLA_POLICY_EXACT_LEN(8),
+ [NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP] =
+ NLA_POLICY_EXACT_LEN(8),
+ [NL80211_HE_OBSS_PD_ATTR_SR_CTRL] = { .type = NLA_U8 },
};
static const struct nla_policy
@@ -336,6 +352,13 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
.len = NL80211_MAX_SUPP_HT_RATES },
[NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)),
[NL80211_TXRATE_GI] = { .type = NLA_U8 },
+ [NL80211_TXRATE_HE] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_he)),
+ [NL80211_TXRATE_HE_GI] = NLA_POLICY_RANGE(NLA_U8,
+ NL80211_RATE_INFO_HE_GI_0_8,
+ NL80211_RATE_INFO_HE_GI_3_2),
+ [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8,
+ NL80211_RATE_INFO_HE_1XLTF,
+ NL80211_RATE_INFO_HE_4XLTF),
};
static const struct nla_policy
@@ -360,6 +383,34 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = {
NLA_POLICY_NESTED(nl80211_txattr_policy),
};
+static const struct nla_policy
+nl80211_fils_discovery_policy[NL80211_FILS_DISCOVERY_ATTR_MAX + 1] = {
+ [NL80211_FILS_DISCOVERY_ATTR_INT_MIN] = NLA_POLICY_MAX(NLA_U32, 10000),
+ [NL80211_FILS_DISCOVERY_ATTR_INT_MAX] = NLA_POLICY_MAX(NLA_U32, 10000),
+	[NL80211_FILS_DISCOVERY_ATTR_TMPL] =
+		NLA_POLICY_RANGE(NLA_BINARY,
+				 NL80211_FILS_DISCOVERY_TMPL_MIN_LEN,
+				 IEEE80211_MAX_DATA_LEN),
+};
+
+static const struct nla_policy
+nl80211_unsol_bcast_probe_resp_policy[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1] = {
+ [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] = NLA_POLICY_MAX(NLA_U32, 20),
+ [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN }
+};
+
+static const struct nla_policy
+sar_specs_policy[NL80211_SAR_ATTR_SPECS_MAX + 1] = {
+ [NL80211_SAR_ATTR_SPECS_POWER] = { .type = NLA_S32 },
+ [NL80211_SAR_ATTR_SPECS_RANGE_INDEX] = {.type = NLA_U32 },
+};
+
+static const struct nla_policy
+sar_policy[NL80211_SAR_ATTR_MAX + 1] = {
+ [NL80211_SAR_ATTR_TYPE] = NLA_POLICY_MAX(NLA_U32, NUM_NL80211_SAR_TYPE),
+ [NL80211_SAR_ATTR_SPECS] = NLA_POLICY_NESTED_ARRAY(sar_specs_policy),
+};
+
static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD },
[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
@@ -539,7 +590,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 },
[NL80211_ATTR_WDEV] = { .type = NLA_U64 },
[NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 },
- [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, },
+
+ /* need to include at least Auth Transaction and Status Code */
+ [NL80211_ATTR_AUTH_DATA] = NLA_POLICY_MIN_LEN(4),
+
[NL80211_ATTR_VHT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_VHT_CAPABILITY_LEN),
[NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
[NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127),
@@ -561,23 +615,30 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
.len = IEEE80211_MAX_DATA_LEN },
[NL80211_ATTR_CRIT_PROT_ID] = { .type = NLA_U16 },
- [NL80211_ATTR_MAX_CRIT_PROT_DURATION] = { .type = NLA_U16 },
+ [NL80211_ATTR_MAX_CRIT_PROT_DURATION] =
+ NLA_POLICY_MAX(NLA_U16, NL80211_CRIT_PROTO_MAX_DURATION),
[NL80211_ATTR_PEER_AID] =
NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
[NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 },
[NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG },
[NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED },
- [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_BINARY },
- [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_BINARY },
- [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = { .type = NLA_BINARY },
- [NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] = { .type = NLA_BINARY },
+ [NL80211_ATTR_CNTDWN_OFFS_BEACON] = { .type = NLA_BINARY },
+ [NL80211_ATTR_CNTDWN_OFFS_PRESP] = { .type = NLA_BINARY },
+ [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = NLA_POLICY_MIN_LEN(2),
+ /*
+ * The value of the Length field of the Supported Operating
+ * Classes element is between 2 and 253.
+ */
+ [NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] =
+ NLA_POLICY_RANGE(NLA_BINARY, 2, 253),
[NL80211_ATTR_HANDLE_DFS] = { .type = NLA_FLAG },
[NL80211_ATTR_OPMODE_NOTIF] = { .type = NLA_U8 },
[NL80211_ATTR_VENDOR_ID] = { .type = NLA_U32 },
[NL80211_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 },
[NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY },
- [NL80211_ATTR_QOS_MAP] = { .type = NLA_BINARY,
- .len = IEEE80211_QOS_MAP_LEN_MAX },
+ [NL80211_ATTR_QOS_MAP] = NLA_POLICY_RANGE(NLA_BINARY,
+ IEEE80211_QOS_MAP_LEN_MIN,
+ IEEE80211_QOS_MAP_LEN_MAX),
[NL80211_ATTR_MAC_HINT] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
[NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 },
[NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 },
@@ -625,15 +686,17 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
.len = FILS_ERP_MAX_RRK_LEN },
[NL80211_ATTR_FILS_CACHE_ID] = NLA_POLICY_EXACT_LEN_WARN(2),
[NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN },
+ [NL80211_ATTR_PMKR0_NAME] = NLA_POLICY_EXACT_LEN(WLAN_PMK_NAME_LEN),
[NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG },
[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG },
[NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
[NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
[NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
- [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,
- .len = NL80211_HE_MAX_CAPABILITY_LEN },
-
+ [NL80211_ATTR_HE_CAPABILITY] =
+ NLA_POLICY_RANGE(NLA_BINARY,
+ NL80211_HE_MIN_CAPABILITY_LEN,
+ NL80211_HE_MAX_CAPABILITY_LEN),
[NL80211_ATTR_FTM_RESPONDER] =
NLA_POLICY_NESTED(nl80211_ftm_responder_policy),
[NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1),
@@ -654,10 +717,21 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG },
[NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999),
[NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED },
- [NL80211_ATTR_HE_6GHZ_CAPABILITY] = {
- .type = NLA_EXACT_LEN,
- .len = sizeof(struct ieee80211_he_6ghz_capa),
- },
+ [NL80211_ATTR_HE_6GHZ_CAPABILITY] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct ieee80211_he_6ghz_capa)),
+ [NL80211_ATTR_FILS_DISCOVERY] =
+ NLA_POLICY_NESTED(nl80211_fils_discovery_policy),
+ [NL80211_ATTR_UNSOL_BCAST_PROBE_RESP] =
+ NLA_POLICY_NESTED(nl80211_unsol_bcast_probe_resp_policy),
+ [NL80211_ATTR_S1G_CAPABILITY] =
+ NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN),
+ [NL80211_ATTR_S1G_CAPABILITY_MASK] =
+ NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN),
+ [NL80211_ATTR_SAE_PWE] =
+ NLA_POLICY_RANGE(NLA_U8, NL80211_SAE_PWE_HUNT_AND_PECK,
+ NL80211_SAE_PWE_BOTH),
+ [NL80211_ATTR_RECONNECT_REQUESTED] = { .type = NLA_REJECT },
+ [NL80211_ATTR_SAR_SPEC] = NLA_POLICY_NESTED(sar_policy),
};
/* policy for the key attributes */
@@ -703,7 +777,7 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
[NL80211_WOWLAN_TCP_DST_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
[NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 },
[NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 },
- [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 },
+ [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = NLA_POLICY_MIN_LEN(1),
[NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ] = {
.len = sizeof(struct nl80211_wowlan_tcp_data_seq)
},
@@ -711,8 +785,8 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
.len = sizeof(struct nl80211_wowlan_tcp_data_token)
},
[NL80211_WOWLAN_TCP_DATA_INTERVAL] = { .type = NLA_U32 },
- [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 },
- [NL80211_WOWLAN_TCP_WAKE_MASK] = { .type = NLA_MIN_LEN, .len = 1 },
+ [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = NLA_POLICY_MIN_LEN(1),
+ [NL80211_WOWLAN_TCP_WAKE_MASK] = NLA_POLICY_MIN_LEN(1),
};
#endif /* CONFIG_PM */
@@ -738,7 +812,7 @@ nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = {
.type = NLA_BINARY,
.len = NL80211_KCK_EXT_LEN
},
- [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN),
+ [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN(NL80211_REPLAY_CTR_LEN),
[NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 },
};
@@ -778,7 +852,8 @@ nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = {
/* policy for NAN function attributes */
static const struct nla_policy
nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = {
- [NL80211_NAN_FUNC_TYPE] = { .type = NLA_U8 },
+ [NL80211_NAN_FUNC_TYPE] =
+ NLA_POLICY_MAX(NLA_U8, NL80211_NAN_FUNC_MAX_TYPE),
[NL80211_NAN_FUNC_SERVICE_ID] = {
.len = NL80211_NAN_FUNC_SERVICE_ID_LEN },
[NL80211_NAN_FUNC_PUBLISH_TYPE] = { .type = NLA_U8 },
@@ -926,6 +1001,8 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
if (!large && chan->flags &
(IEEE80211_CHAN_NO_10MHZ | IEEE80211_CHAN_NO_20MHZ))
return 0;
+ if (!large && chan->freq_offset)
+ return 0;
if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_FREQ,
chan->center_freq))
@@ -992,6 +1069,21 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
if ((chan->flags & IEEE80211_CHAN_NO_HE) &&
nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE))
goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_1MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_2MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_4MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_8MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_16MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ))
+ goto nla_put_failure;
}
if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
@@ -1603,7 +1695,8 @@ nl80211_send_iftype_data(struct sk_buff *msg,
}
static int nl80211_send_band_rateinfo(struct sk_buff *msg,
- struct ieee80211_supported_band *sband)
+ struct ieee80211_supported_band *sband,
+ bool large)
{
struct nlattr *nl_rates, *nl_rate;
struct ieee80211_rate *rate;
@@ -1631,7 +1724,7 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
sband->vht_cap.cap)))
return -ENOBUFS;
- if (sband->n_iftype_data) {
+ if (large && sband->n_iftype_data) {
struct nlattr *nl_iftype_data =
nla_nest_start_noflag(msg,
NL80211_BAND_ATTR_IFTYPE_DATA);
@@ -1659,7 +1752,7 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
}
/* add EDMG info */
- if (sband->edmg_cap.channels &&
+ if (large && sband->edmg_cap.channels &&
(nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_CHANNELS,
sband->edmg_cap.channels) ||
nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_BW_CONFIG,
@@ -1806,7 +1899,6 @@ static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev,
if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL))
goto nla_put_failure;
}
- CMD(set_wds_peer, SET_WDS_PEER);
if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) {
CMD(tdls_mgmt, TDLS_MGMT);
CMD(tdls_oper, TDLS_OPER);
@@ -2016,6 +2108,56 @@ fail:
return -ENOBUFS;
}
+static int
+nl80211_put_sar_specs(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ struct nlattr *sar_capa, *specs, *sub_freq_range;
+ u8 num_freq_ranges;
+ int i;
+
+ if (!rdev->wiphy.sar_capa)
+ return 0;
+
+ num_freq_ranges = rdev->wiphy.sar_capa->num_freq_ranges;
+
+ sar_capa = nla_nest_start(msg, NL80211_ATTR_SAR_SPEC);
+ if (!sar_capa)
+ return -ENOSPC;
+
+ if (nla_put_u32(msg, NL80211_SAR_ATTR_TYPE, rdev->wiphy.sar_capa->type))
+ goto fail;
+
+ specs = nla_nest_start(msg, NL80211_SAR_ATTR_SPECS);
+ if (!specs)
+ goto fail;
+
+ /* report supported freq_ranges */
+ for (i = 0; i < num_freq_ranges; i++) {
+ sub_freq_range = nla_nest_start(msg, i + 1);
+ if (!sub_freq_range)
+ goto fail;
+
+ if (nla_put_u32(msg, NL80211_SAR_ATTR_SPECS_START_FREQ,
+ rdev->wiphy.sar_capa->freq_ranges[i].start_freq))
+ goto fail;
+
+ if (nla_put_u32(msg, NL80211_SAR_ATTR_SPECS_END_FREQ,
+ rdev->wiphy.sar_capa->freq_ranges[i].end_freq))
+ goto fail;
+
+ nla_nest_end(msg, sub_freq_range);
+ }
+
+ nla_nest_end(msg, specs);
+ nla_nest_end(msg, sar_capa);
+
+ return 0;
+fail:
+ nla_nest_cancel(msg, sar_capa);
+ return -ENOBUFS;
+}
+
struct nl80211_dump_wiphy_state {
s64 filter_wiphy;
long start;
@@ -2077,13 +2219,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
nla_put_u16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN,
rdev->wiphy.max_sched_scan_ie_len) ||
nla_put_u8(msg, NL80211_ATTR_MAX_MATCH_SETS,
- rdev->wiphy.max_match_sets) ||
- nla_put_u32(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_PLANS,
- rdev->wiphy.max_sched_scan_plans) ||
- nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_INTERVAL,
- rdev->wiphy.max_sched_scan_plan_interval) ||
- nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS,
- rdev->wiphy.max_sched_scan_plan_iterations))
+ rdev->wiphy.max_match_sets))
goto nla_put_failure;
if ((rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) &&
@@ -2173,6 +2309,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
band < NUM_NL80211_BANDS; band++) {
struct ieee80211_supported_band *sband;
+ /* omit higher bands for ancient software */
+ if (band > NL80211_BAND_5GHZ && !state->split)
+ break;
+
sband = rdev->wiphy.bands[band];
if (!sband)
@@ -2184,7 +2324,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
switch (state->chan_start) {
case 0:
- if (nl80211_send_band_rateinfo(msg, sband))
+ if (nl80211_send_band_rateinfo(msg, sband,
+ state->split))
goto nla_put_failure;
state->chan_start++;
if (state->split)
@@ -2266,6 +2407,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST);
CMD(update_connect_params, UPDATE_CONNECT_PARAMS);
CMD(update_ft_ies, UPDATE_FT_IES);
+ if (rdev->wiphy.sar_capa)
+ CMD(set_sar_specs, SET_SAR_SPECS);
}
#undef CMD
@@ -2286,8 +2429,6 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
nla_put_flag(msg, NL80211_ATTR_OFFCHANNEL_TX_OK))
goto nla_put_failure;
- if (nl80211_send_mgmt_stypes(msg, mgmt_stypes))
- goto nla_put_failure;
state->split_start++;
if (state->split)
break;
@@ -2355,9 +2496,23 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
* case we'll continue with more data in the next round,
* but break unconditionally so unsplit data stops here.
*/
- state->split_start++;
+ if (state->split)
+ state->split_start++;
+ else
+ state->split_start = 0;
break;
case 9:
+ if (nl80211_send_mgmt_stypes(msg, mgmt_stypes))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_PLANS,
+ rdev->wiphy.max_sched_scan_plans) ||
+ nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_INTERVAL,
+ rdev->wiphy.max_sched_scan_plan_interval) ||
+ nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS,
+ rdev->wiphy.max_sched_scan_plan_iterations))
+ goto nla_put_failure;
+
if (rdev->wiphy.extended_capabilities &&
(nla_put(msg, NL80211_ATTR_EXT_CAPA,
rdev->wiphy.extended_capabilities_len,
@@ -2579,6 +2734,11 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
if (nl80211_put_tid_config_support(rdev, msg))
goto nla_put_failure;
+ state->split_start++;
+ break;
+ case 16:
+ if (nl80211_put_sar_specs(rdev, msg))
+ goto nla_put_failure;
/* done */
state->split_start = 0;
@@ -2773,8 +2933,8 @@ static int parse_txq_params(struct nlattr *tb[],
static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
{
/*
- * You can only set the channel explicitly for WDS interfaces,
- * all others have their channel managed via their respective
+ * You can only set the channel explicitly for some interfaces;
+ * most have their channel managed via their respective
* "establish a connection" command (connect, join, ...)
*
* For AP/GO and mesh mode, the channel can be set with the
@@ -2979,29 +3139,6 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
return __nl80211_set_channel(rdev, netdev, info);
}
-static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info)
-{
- struct cfg80211_registered_device *rdev = info->user_ptr[0];
- struct net_device *dev = info->user_ptr[1];
- struct wireless_dev *wdev = dev->ieee80211_ptr;
- const u8 *bssid;
-
- if (!info->attrs[NL80211_ATTR_MAC])
- return -EINVAL;
-
- if (netif_running(dev))
- return -EBUSY;
-
- if (!rdev->ops->set_wds_peer)
- return -EOPNOTSUPP;
-
- if (wdev->iftype != NL80211_IFTYPE_WDS)
- return -EOPNOTSUPP;
-
- bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
- return rdev_set_wds_peer(rdev, dev, bssid);
-}
-
static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev;
@@ -3798,7 +3935,8 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
* P2P Device and NAN do not have a netdev, so don't go
* through the netdev notifier and must be added here
*/
- cfg80211_init_wdev(rdev, wdev);
+ cfg80211_init_wdev(wdev);
+ cfg80211_register_wdev(rdev, wdev);
break;
default:
break;
@@ -4172,9 +4310,6 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
- if (key.idx < 0)
- return -EINVAL;
-
if (info->attrs[NL80211_ATTR_MAC])
mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
@@ -4190,6 +4325,10 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
key.type != NL80211_KEYTYPE_GROUP)
return -EINVAL;
+ if (!cfg80211_valid_key_idx(rdev, key.idx,
+ key.type == NL80211_KEYTYPE_PAIRWISE))
+ return -EINVAL;
+
if (!rdev->ops->del_key)
return -EOPNOTSUPP;
@@ -4422,21 +4561,110 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband,
return true;
}
+static u16 he_mcs_map_to_mcs_mask(u8 he_mcs_map)
+{
+ switch (he_mcs_map) {
+ case IEEE80211_HE_MCS_NOT_SUPPORTED:
+ return 0;
+ case IEEE80211_HE_MCS_SUPPORT_0_7:
+ return 0x00FF;
+ case IEEE80211_HE_MCS_SUPPORT_0_9:
+ return 0x03FF;
+ case IEEE80211_HE_MCS_SUPPORT_0_11:
+ return 0xFFF;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void he_build_mcs_mask(u16 he_mcs_map,
+ u16 he_mcs_mask[NL80211_HE_NSS_MAX])
+{
+ u8 nss;
+
+ for (nss = 0; nss < NL80211_HE_NSS_MAX; nss++) {
+ he_mcs_mask[nss] = he_mcs_map_to_mcs_mask(he_mcs_map & 0x03);
+ he_mcs_map >>= 2;
+ }
+}
+
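A worked example of the 2-bits-per-stream expansion done by the two helpers above (the map value is picked only for illustration):

	/*
	 * he_mcs_map = 0xFFFA:
	 *   NSS 1: bits 1:0 = 0b10 (MCS 0-11)  -> he_mcs_mask[0] = 0x0FFF
	 *   NSS 2: bits 3:2 = 0b10 (MCS 0-11)  -> he_mcs_mask[1] = 0x0FFF
	 *   NSS 3-8:          0b11 (no support)-> he_mcs_mask[2..7] = 0x0000
	 */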
+static u16 he_get_txmcsmap(struct genl_info *info,
+ const struct ieee80211_sta_he_cap *he_cap)
+{
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ __le16 tx_mcs;
+
+ switch (wdev->chandef.width) {
+ case NL80211_CHAN_WIDTH_80P80:
+ tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80p80;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_160;
+ break;
+ default:
+ tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80;
+ break;
+ }
+ return le16_to_cpu(tx_mcs);
+}
+
+static bool he_set_mcs_mask(struct genl_info *info,
+ struct wireless_dev *wdev,
+ struct ieee80211_supported_band *sband,
+ struct nl80211_txrate_he *txrate,
+ u16 mcs[NL80211_HE_NSS_MAX])
+{
+ const struct ieee80211_sta_he_cap *he_cap;
+ u16 tx_mcs_mask[NL80211_HE_NSS_MAX] = {};
+ u16 tx_mcs_map = 0;
+ u8 i;
+
+ he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype);
+ if (!he_cap)
+ return false;
+
+ memset(mcs, 0, sizeof(u16) * NL80211_HE_NSS_MAX);
+
+ tx_mcs_map = he_get_txmcsmap(info, he_cap);
+
+ /* Build he_mcs_mask from HE capabilities */
+ he_build_mcs_mask(tx_mcs_map, tx_mcs_mask);
+
+ for (i = 0; i < NL80211_HE_NSS_MAX; i++) {
+ if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i])
+ mcs[i] = txrate->mcs[i];
+ else
+ return false;
+ }
+
+ return true;
+}
+
static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
struct nlattr *attrs[],
enum nl80211_attrs attr,
- struct cfg80211_bitrate_mask *mask)
+ struct cfg80211_bitrate_mask *mask,
+ struct net_device *dev,
+ bool default_all_enabled)
{
struct nlattr *tb[NL80211_TXRATE_MAX + 1];
struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
int rem, i;
struct nlattr *tx_rates;
struct ieee80211_supported_band *sband;
- u16 vht_tx_mcs_map;
+ u16 vht_tx_mcs_map, he_tx_mcs_map;
memset(mask, 0, sizeof(*mask));
/* Default to all rates enabled */
for (i = 0; i < NUM_NL80211_BANDS; i++) {
+ const struct ieee80211_sta_he_cap *he_cap;
+
+ if (!default_all_enabled)
+ break;
+
sband = rdev->wiphy.bands[i];
if (!sband)
@@ -4452,6 +4680,16 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs);
+
+ he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype);
+ if (!he_cap)
+ continue;
+
+ he_tx_mcs_map = he_get_txmcsmap(info, he_cap);
+ he_build_mcs_mask(he_tx_mcs_map, mask->control[i].he_mcs);
+
+ mask->control[i].he_gi = 0xFF;
+ mask->control[i].he_ltf = 0xFF;
}
/* if no rates are given set it back to the defaults */
@@ -4494,6 +4732,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
mask->control[band].ht_mcs))
return -EINVAL;
}
+
if (tb[NL80211_TXRATE_VHT]) {
if (!vht_set_mcs_mask(
sband,
@@ -4501,19 +4740,33 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
mask->control[band].vht_mcs))
return -EINVAL;
}
+
if (tb[NL80211_TXRATE_GI]) {
mask->control[band].gi =
nla_get_u8(tb[NL80211_TXRATE_GI]);
if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI)
return -EINVAL;
}
+ if (tb[NL80211_TXRATE_HE] &&
+ !he_set_mcs_mask(info, wdev, sband,
+ nla_data(tb[NL80211_TXRATE_HE]),
+ mask->control[band].he_mcs))
+ return -EINVAL;
+
+ if (tb[NL80211_TXRATE_HE_GI])
+ mask->control[band].he_gi =
+ nla_get_u8(tb[NL80211_TXRATE_HE_GI]);
+ if (tb[NL80211_TXRATE_HE_LTF])
+ mask->control[band].he_ltf =
+ nla_get_u8(tb[NL80211_TXRATE_HE_LTF]);
if (mask->control[band].legacy == 0) {
- /* don't allow empty legacy rates if HT or VHT
+ /* don't allow empty legacy rates if HT, VHT or HE
* are not even supported.
*/
if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported ||
- rdev->wiphy.bands[band]->vht_cap.vht_supported))
+ rdev->wiphy.bands[band]->vht_cap.vht_supported ||
+ ieee80211_get_he_iftype_cap(sband, wdev->iftype)))
return -EINVAL;
for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
@@ -4524,6 +4777,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
if (mask->control[band].vht_mcs[i])
goto out;
+ for (i = 0; i < NL80211_HE_NSS_MAX; i++)
+ if (mask->control[band].he_mcs[i])
+ goto out;
+
/* legacy and mcs rates may not be both empty */
return -EINVAL;
}
@@ -4537,7 +4794,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
enum nl80211_band band,
struct cfg80211_bitrate_mask *beacon_rate)
{
- u32 count_ht, count_vht, i;
+ u32 count_ht, count_vht, count_he, i;
u32 rate = beacon_rate->control[band].legacy;
/* Allow only one rate */
@@ -4570,7 +4827,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
return -EINVAL;
}
- if ((count_ht && count_vht) || (!rate && !count_ht && !count_vht))
+ count_he = 0;
+ for (i = 0; i < NL80211_HE_NSS_MAX; i++) {
+ if (hweight16(beacon_rate->control[band].he_mcs[i]) > 1) {
+ return -EINVAL;
+ } else if (beacon_rate->control[band].he_mcs[i]) {
+ count_he++;
+ if (count_he > 1)
+ return -EINVAL;
+ }
+ if (count_he && rate)
+ return -EINVAL;
+ }
+
+ if ((count_ht && count_vht && count_he) ||
+ (!rate && !count_ht && !count_vht && !count_he))
return -EINVAL;
if (rate &&
@@ -4585,6 +4856,10 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
!wiphy_ext_feature_isset(&rdev->wiphy,
NL80211_EXT_FEATURE_BEACON_RATE_VHT))
return -EINVAL;
+ if (count_he &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_RATE_HE))
+ return -EINVAL;
return 0;
}
@@ -4683,18 +4958,34 @@ static int nl80211_parse_he_obss_pd(struct nlattr *attrs,
if (err)
return err;
- if (!tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] ||
- !tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET])
+ if (!tb[NL80211_HE_OBSS_PD_ATTR_SR_CTRL])
return -EINVAL;
- he_obss_pd->min_offset =
- nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]);
- he_obss_pd->max_offset =
- nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]);
+ he_obss_pd->sr_ctrl = nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_SR_CTRL]);
+
+ if (tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET])
+ he_obss_pd->min_offset =
+ nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]);
+ if (tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET])
+ he_obss_pd->max_offset =
+ nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]);
+ if (tb[NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET])
+ he_obss_pd->non_srg_max_offset =
+ nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET]);
- if (he_obss_pd->min_offset >= he_obss_pd->max_offset)
+ if (he_obss_pd->min_offset > he_obss_pd->max_offset)
return -EINVAL;
+ if (tb[NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP])
+ memcpy(he_obss_pd->bss_color_bitmap,
+ nla_data(tb[NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP]),
+ sizeof(he_obss_pd->bss_color_bitmap));
+
+ if (tb[NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP])
+ memcpy(he_obss_pd->partial_bssid_bitmap,
+ nla_data(tb[NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP]),
+ sizeof(he_obss_pd->partial_bssid_bitmap));
+
he_obss_pd->enable = true;
return 0;
@@ -4724,6 +5015,65 @@ static int nl80211_parse_he_bss_color(struct nlattr *attrs,
return 0;
}
+static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev,
+ struct nlattr *attrs,
+ struct cfg80211_ap_settings *params)
+{
+ struct nlattr *tb[NL80211_FILS_DISCOVERY_ATTR_MAX + 1];
+ int ret;
+ struct cfg80211_fils_discovery *fd = &params->fils_discovery;
+
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_FILS_DISCOVERY))
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, NL80211_FILS_DISCOVERY_ATTR_MAX, attrs,
+ NULL, NULL);
+ if (ret)
+ return ret;
+
+ if (!tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN] ||
+ !tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX] ||
+ !tb[NL80211_FILS_DISCOVERY_ATTR_TMPL])
+ return -EINVAL;
+
+ fd->tmpl_len = nla_len(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]);
+ fd->tmpl = nla_data(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]);
+ fd->min_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN]);
+ fd->max_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX]);
+
+ return 0;
+}
+
+static int
+nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev,
+ struct nlattr *attrs,
+ struct cfg80211_ap_settings *params)
+{
+ struct nlattr *tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1];
+ int ret;
+ struct cfg80211_unsol_bcast_probe_resp *presp =
+ &params->unsol_bcast_probe_resp;
+
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP))
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX,
+ attrs, NULL, NULL);
+ if (ret)
+ return ret;
+
+ if (!tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] ||
+ !tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL])
+ return -EINVAL;
+
+ presp->tmpl = nla_data(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]);
+ presp->tmpl_len = nla_len(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]);
+ presp->interval = nla_get_u32(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT]);
+ return 0;
+}
+
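For reference, the nesting the two parsers above expect inside NL80211_CMD_START_AP; the attribute names and types are taken directly from the parsing code (interval units follow the respective specifications and are not restated here):

	/*
	 * NL80211_ATTR_FILS_DISCOVERY (nested)
	 *   NL80211_FILS_DISCOVERY_ATTR_INT_MIN  (u32, mandatory)
	 *   NL80211_FILS_DISCOVERY_ATTR_INT_MAX  (u32, mandatory)
	 *   NL80211_FILS_DISCOVERY_ATTR_TMPL     (binary frame template, mandatory)
	 *
	 * NL80211_ATTR_UNSOL_BCAST_PROBE_RESP (nested)
	 *   NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT   (u32, mandatory)
	 *   NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL  (binary frame template, mandatory)
	 */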
static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
const u8 *rates)
{
@@ -4739,6 +5089,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
params->vht_required = true;
if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY)
params->he_required = true;
+ if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_SAE_H2E)
+ params->sae_h2e_required = true;
}
}
@@ -4834,8 +5186,9 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
return false;
return true;
case NL80211_CMD_START_AP:
- /* SAE not supported yet */
- if (auth_type == NL80211_AUTHTYPE_SAE)
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_SAE_OFFLOAD_AP) &&
+ auth_type == NL80211_AUTHTYPE_SAE)
return false;
/* FILS not supported yet */
if (auth_type == NL80211_AUTHTYPE_FILS_SK ||
@@ -4899,8 +5252,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
params.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
params.ssid_len =
nla_len(info->attrs[NL80211_ATTR_SSID]);
- if (params.ssid_len == 0 ||
- params.ssid_len > IEEE80211_MAX_SSID_LEN)
+ if (params.ssid_len == 0)
return -EINVAL;
}
@@ -4969,7 +5321,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_TX_RATES]) {
err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
NL80211_ATTR_TX_RATES,
- &params.beacon_rate);
+ &params.beacon_rate,
+ dev, false);
if (err)
return err;
@@ -5031,6 +5384,22 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
goto out;
}
+ if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) {
+ err = nl80211_parse_fils_discovery(rdev,
+ info->attrs[NL80211_ATTR_FILS_DISCOVERY],
+ &params);
+ if (err)
+ goto out;
+ }
+
+ if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) {
+ err = nl80211_parse_unsol_bcast_probe_resp(
+ rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP],
+ &params);
+ if (err)
+ return err;
+ }
+
nl80211_calculate_ap_params(&params);
if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])
@@ -5840,11 +6209,9 @@ static int nl80211_parse_sta_channel_info(struct genl_info *info,
nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS]);
/*
* Need to include at least one (first channel, number of
- * channels) tuple for each subband, and must have proper
- * tuples for the rest of the data as well.
+ * channels) tuple for each subband (checked in policy),
+ * and must have proper tuples for the rest of the data as well.
*/
- if (params->supported_channels_len < 2)
- return -EINVAL;
if (params->supported_channels_len % 2)
return -EINVAL;
}
@@ -5854,13 +6221,6 @@ static int nl80211_parse_sta_channel_info(struct genl_info *info,
nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]);
params->supported_oper_classes_len =
nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]);
- /*
- * The value of the Length field of the Supported Operating
- * Classes element is between 2 and 253.
- */
- if (params->supported_oper_classes_len < 2 ||
- params->supported_oper_classes_len > 253)
- return -EINVAL;
}
return 0;
}
@@ -5883,9 +6243,6 @@ static int nl80211_set_station_tdls(struct genl_info *info,
nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
params->he_capa_len =
nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
-
- if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
- return -EINVAL;
}
err = nl80211_parse_sta_channel_info(info, params);
@@ -6144,10 +6501,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
params.he_capa_len =
nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
-
- /* max len is validated in nla policy */
- if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
- return -EINVAL;
}
if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
@@ -7962,12 +8315,6 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
}
if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) {
- if (!wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_SET_SCAN_DWELL)) {
- err = -EOPNOTSUPP;
- goto out_free;
- }
-
request->duration =
nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]);
request->duration_mandatory =
@@ -8006,7 +8353,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
request->scan_start = jiffies;
rdev->scan_req = request;
- err = rdev_scan(rdev, request);
+ err = cfg80211_scan(rdev);
if (err)
goto out_free;
@@ -8419,23 +8766,14 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
}
if (ssid) {
- if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) {
- err = -EINVAL;
- goto out_free;
- }
memcpy(request->match_sets[i].ssid.ssid,
nla_data(ssid), nla_len(ssid));
request->match_sets[i].ssid.ssid_len =
nla_len(ssid);
}
- if (bssid) {
- if (nla_len(bssid) != ETH_ALEN) {
- err = -EINVAL;
- goto out_free;
- }
+ if (bssid)
memcpy(request->match_sets[i].bssid,
nla_data(bssid), ETH_ALEN);
- }
/* special attribute - old implementation w/a */
request->match_sets[i].rssi_thold = default_match_rssi;
@@ -8790,10 +9128,10 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
- if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON])
+ if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON])
return -EINVAL;
- len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]);
+ len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]);
if (!len || (len % sizeof(u16)))
return -EINVAL;
@@ -8804,7 +9142,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
params.counter_offsets_beacon =
- nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]);
+ nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]);
/* sanity checks - counters should fit and be the same */
for (i = 0; i < params.n_counter_offsets_beacon; i++) {
@@ -8817,8 +9155,8 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
- if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) {
- len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]);
+ if (csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]) {
+ len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]);
if (!len || (len % sizeof(u16)))
return -EINVAL;
@@ -8829,7 +9167,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
params.counter_offsets_presp =
- nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]);
+ nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]);
/* sanity checks - counters should fit and be the same */
for (i = 0; i < params.n_counter_offsets_presp; i++) {
@@ -9094,6 +9432,11 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
survey->channel->center_freq))
goto nla_put_failure;
+ if (survey->channel && survey->channel->freq_offset &&
+ nla_put_u32(msg, NL80211_SURVEY_INFO_FREQUENCY_OFFSET,
+ survey->channel->freq_offset))
+ goto nla_put_failure;
+
if ((survey->filled & SURVEY_INFO_NOISE_DBM) &&
nla_put_u8(msg, NL80211_SURVEY_INFO_NOISE, survey->noise))
goto nla_put_failure;
@@ -9312,9 +9655,6 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]);
auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]);
- /* need to include at least Auth Transaction and Status Code */
- if (auth_data_len < 4)
- return -EINVAL;
}
local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
@@ -9454,7 +9794,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) {
if (!wiphy_ext_feature_isset(&rdev->wiphy,
- NL80211_EXT_FEATURE_SAE_OFFLOAD))
+ NL80211_EXT_FEATURE_SAE_OFFLOAD) &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_SAE_OFFLOAD_AP))
return -EINVAL;
settings->sae_pwd =
nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
@@ -9462,6 +9804,12 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
}
+ if (info->attrs[NL80211_ATTR_SAE_PWE])
+ settings->sae_pwe =
+ nla_get_u8(info->attrs[NL80211_ATTR_SAE_PWE]);
+ else
+ settings->sae_pwe = NL80211_SAE_PWE_UNSPECIFIED;
+
return 0;
}
@@ -9572,6 +9920,22 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]);
}
+ if (info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]) {
+ if (!info->attrs[NL80211_ATTR_S1G_CAPABILITY])
+ return -EINVAL;
+ memcpy(&req.s1g_capa_mask,
+ nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]),
+ sizeof(req.s1g_capa_mask));
+ }
+
+ if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) {
+ if (!info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK])
+ return -EINVAL;
+ memcpy(&req.s1g_capa,
+ nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]),
+ sizeof(req.s1g_capa));
+ }
+
err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
if (!err) {
wdev_lock(dev->ieee80211_ptr);
@@ -10801,7 +11165,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
return -EOPNOTSUPP;
err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
- NL80211_ATTR_TX_RATES, &mask);
+ NL80211_ATTR_TX_RATES, &mask,
+ dev, true);
if (err)
return err;
@@ -10878,6 +11243,7 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
case NL80211_IFTYPE_P2P_DEVICE:
if (!info->attrs[NL80211_ATTR_WIPHY_FREQ])
return -EINVAL;
+ break;
case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_ADHOC:
case NL80211_IFTYPE_P2P_CLIENT:
@@ -11409,7 +11775,8 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_TX_RATES]) {
err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
NL80211_ATTR_TX_RATES,
- &setup.beacon_rate);
+ &setup.beacon_rate,
+ dev, false);
if (err)
return err;
@@ -12346,7 +12713,7 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
struct net_device *dev = info->user_ptr[1];
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct nlattr *tb[NUM_NL80211_REKEY_DATA];
- struct cfg80211_gtk_rekey_data rekey_data;
+ struct cfg80211_gtk_rekey_data rekey_data = {};
int err;
if (!info->attrs[NL80211_ATTR_REKEY_DATA])
@@ -12361,8 +12728,6 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] ||
!tb[NL80211_REKEY_DATA_KCK])
return -EINVAL;
- if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN)
- return -ERANGE;
if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN &&
!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK &&
nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KEK_EXT_LEN))
@@ -12687,8 +13052,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
func->cookie = cfg80211_assign_cookie(rdev);
- if (!tb[NL80211_NAN_FUNC_TYPE] ||
- nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) {
+ if (!tb[NL80211_NAN_FUNC_TYPE]) {
err = -EINVAL;
goto out;
}
@@ -13178,9 +13542,6 @@ static int nl80211_crit_protocol_start(struct sk_buff *skb,
duration =
nla_get_u16(info->attrs[NL80211_ATTR_MAX_CRIT_PROT_DURATION]);
- if (duration > NL80211_CRIT_PROTO_MAX_DURATION)
- return -ERANGE;
-
ret = rdev_crit_proto_start(rdev, wdev, proto, duration);
if (!ret)
rdev->crit_proto_nlportid = info->snd_portid;
@@ -13565,8 +13926,7 @@ static int nl80211_set_qos_map(struct sk_buff *skb,
pos = nla_data(info->attrs[NL80211_ATTR_QOS_MAP]);
len = nla_len(info->attrs[NL80211_ATTR_QOS_MAP]);
- if (len % 2 || len < IEEE80211_QOS_MAP_LEN_MIN ||
- len > IEEE80211_QOS_MAP_LEN_MAX)
+ if (len % 2)
return -EINVAL;
qos_map = kzalloc(sizeof(struct cfg80211_qos_map), GFP_KERNEL);
@@ -13834,17 +14194,9 @@ static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info)
goto out;
}
- if (info->attrs[NL80211_ATTR_PMKR0_NAME]) {
- int r0_name_len = nla_len(info->attrs[NL80211_ATTR_PMKR0_NAME]);
-
- if (r0_name_len != WLAN_PMK_NAME_LEN) {
- ret = -EINVAL;
- goto out;
- }
-
+ if (info->attrs[NL80211_ATTR_PMKR0_NAME])
pmk_conf.pmk_r0_name =
nla_data(info->attrs[NL80211_ATTR_PMKR0_NAME]);
- }
ret = rdev_set_pmk(rdev, dev, &pmk_conf);
out:
@@ -13903,8 +14255,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_SSID]) {
params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
- if (params.ssid.ssid_len == 0 ||
- params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN)
+ if (params.ssid.ssid_len == 0)
return -EINVAL;
memcpy(params.ssid.ssid,
nla_data(info->attrs[NL80211_ATTR_SSID]),
@@ -14205,7 +14556,8 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev,
if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) {
attr = NL80211_TID_CONFIG_ATTR_TX_RATE;
err = nl80211_parse_tx_bitrate_mask(info, attrs, attr,
- &tid_conf->txrate_mask);
+ &tid_conf->txrate_mask, dev,
+ true);
if (err)
return err;
@@ -14386,6 +14738,111 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
}
}
+static int nl80211_set_sar_sub_specs(struct cfg80211_registered_device *rdev,
+ struct cfg80211_sar_specs *sar_specs,
+ struct nlattr *spec[], int index)
+{
+ u32 range_index, i;
+
+ if (!sar_specs || !spec)
+ return -EINVAL;
+
+ if (!spec[NL80211_SAR_ATTR_SPECS_POWER] ||
+ !spec[NL80211_SAR_ATTR_SPECS_RANGE_INDEX])
+ return -EINVAL;
+
+ range_index = nla_get_u32(spec[NL80211_SAR_ATTR_SPECS_RANGE_INDEX]);
+
+ /* check if range_index exceeds num_freq_ranges */
+ if (range_index >= rdev->wiphy.sar_capa->num_freq_ranges)
+ return -EINVAL;
+
+ /* check for a duplicate range_index */
+ for (i = 0; i < index; i++) {
+ if (sar_specs->sub_specs[i].freq_range_index == range_index)
+ return -EINVAL;
+ }
+
+ sar_specs->sub_specs[index].power =
+ nla_get_s32(spec[NL80211_SAR_ATTR_SPECS_POWER]);
+
+ sar_specs->sub_specs[index].freq_range_index = range_index;
+
+ return 0;
+}
+
+static int nl80211_set_sar_specs(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct nlattr *spec[NL80211_SAR_ATTR_SPECS_MAX + 1];
+ struct nlattr *tb[NL80211_SAR_ATTR_MAX + 1];
+ struct cfg80211_sar_specs *sar_spec;
+ enum nl80211_sar_type type;
+ struct nlattr *spec_list;
+ u32 specs;
+ int rem, err;
+
+ if (!rdev->wiphy.sar_capa || !rdev->ops->set_sar_specs)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL80211_ATTR_SAR_SPEC])
+ return -EINVAL;
+
+ nla_parse_nested(tb, NL80211_SAR_ATTR_MAX,
+ info->attrs[NL80211_ATTR_SAR_SPEC],
+ NULL, NULL);
+
+ if (!tb[NL80211_SAR_ATTR_TYPE] || !tb[NL80211_SAR_ATTR_SPECS])
+ return -EINVAL;
+
+ type = nla_get_u32(tb[NL80211_SAR_ATTR_TYPE]);
+ if (type != rdev->wiphy.sar_capa->type)
+ return -EINVAL;
+
+ specs = 0;
+ nla_for_each_nested(spec_list, tb[NL80211_SAR_ATTR_SPECS], rem)
+ specs++;
+
+ if (specs > rdev->wiphy.sar_capa->num_freq_ranges)
+ return -EINVAL;
+
+ sar_spec = kzalloc(sizeof(*sar_spec) +
+ specs * sizeof(struct cfg80211_sar_sub_specs),
+ GFP_KERNEL);
+ if (!sar_spec)
+ return -ENOMEM;
+
+ sar_spec->type = type;
+ specs = 0;
+ nla_for_each_nested(spec_list, tb[NL80211_SAR_ATTR_SPECS], rem) {
+ nla_parse_nested(spec, NL80211_SAR_ATTR_SPECS_MAX,
+ spec_list, NULL, NULL);
+
+ switch (type) {
+ case NL80211_SAR_TYPE_POWER:
+ if (nl80211_set_sar_sub_specs(rdev, sar_spec,
+ spec, specs)) {
+ err = -EINVAL;
+ goto error;
+ }
+ break;
+ default:
+ err = -EINVAL;
+ goto error;
+ }
+ specs++;
+ }
+
+ sar_spec->num_sub_specs = specs;
+
+ rdev->cur_cmd_info = info;
+ err = rdev_set_sar_specs(rdev, sar_spec);
+ rdev->cur_cmd_info = NULL;
+error:
+ kfree(sar_spec);
+ return err;
+}
+
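A hedged userspace sketch of the message nl80211_set_sar_specs() parses, assuming libnl-3's genl/nla helpers (genlmsg_put, nla_nest_start, nla_put_u32, nla_put, nla_nest_end); msg, nl80211_family_id, wiphy_idx and power are assumed to be set up elsewhere, and the power units are whatever the driver documents for its SAR type:

	struct nlattr *sar, *specs, *spec;
	int32_t power = -40;	/* illustrative value */

	genlmsg_put(msg, 0, 0, nl80211_family_id, 0, 0,
		    NL80211_CMD_SET_SAR_SPECS, 0);
	nla_put_u32(msg, NL80211_ATTR_WIPHY, wiphy_idx);

	sar = nla_nest_start(msg, NL80211_ATTR_SAR_SPEC);
	nla_put_u32(msg, NL80211_SAR_ATTR_TYPE, NL80211_SAR_TYPE_POWER);
	specs = nla_nest_start(msg, NL80211_SAR_ATTR_SPECS);

	spec = nla_nest_start(msg, 1);	/* list index, value not interpreted */
	nla_put_u32(msg, NL80211_SAR_ATTR_SPECS_RANGE_INDEX, 0);
	nla_put(msg, NL80211_SAR_ATTR_SPECS_POWER, sizeof(power), &power);
	nla_nest_end(msg, spec);

	nla_nest_end(msg, specs);
	nla_nest_end(msg, sar);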
static const struct genl_ops nl80211_ops[] = {
{
.cmd = NL80211_CMD_GET_WIPHY,
@@ -14397,6 +14854,9 @@ static const struct genl_ops nl80211_ops[] = {
.internal_flags = NL80211_FLAG_NEED_WIPHY |
NL80211_FLAG_NEED_RTNL,
},
+};
+
+static const struct genl_small_ops nl80211_small_ops[] = {
{
.cmd = NL80211_CMD_SET_WIPHY,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -14865,14 +15325,6 @@ static const struct genl_ops nl80211_ops[] = {
NL80211_FLAG_NEED_RTNL,
},
{
- .cmd = NL80211_CMD_SET_WDS_PEER,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .doit = nl80211_set_wds_peer,
- .flags = GENL_UNS_ADMIN_PERM,
- .internal_flags = NL80211_FLAG_NEED_NETDEV |
- NL80211_FLAG_NEED_RTNL,
- },
- {
.cmd = NL80211_CMD_JOIN_MESH,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl80211_join_mesh,
@@ -15244,6 +15696,14 @@ static const struct genl_ops nl80211_ops[] = {
.internal_flags = NL80211_FLAG_NEED_NETDEV |
NL80211_FLAG_NEED_RTNL,
},
+ {
+ .cmd = NL80211_CMD_SET_SAR_SPECS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl80211_set_sar_specs,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
};
static struct genl_family nl80211_fam __ro_after_init = {
@@ -15258,6 +15718,8 @@ static struct genl_family nl80211_fam __ro_after_init = {
.module = THIS_MODULE,
.ops = nl80211_ops,
.n_ops = ARRAY_SIZE(nl80211_ops),
+ .small_ops = nl80211_small_ops,
+ .n_small_ops = ARRAY_SIZE(nl80211_small_ops),
.mcgrps = nl80211_mcgrps,
.n_mcgrps = ARRAY_SIZE(nl80211_mcgrps),
.parallel_ops = true,
@@ -15312,6 +15774,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
struct cfg80211_scan_request *req = rdev->scan_req;
struct nlattr *nest;
int i;
+ struct cfg80211_scan_info *info;
if (WARN_ON(!req))
return 0;
@@ -15355,11 +15818,13 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags))
goto nla_put_failure;
- if (req->info.scan_start_tsf &&
+ info = rdev->int_scan_req ? &rdev->int_scan_req->info :
+ &rdev->scan_req->info;
+ if (info->scan_start_tsf &&
(nla_put_u64_64bit(msg, NL80211_ATTR_SCAN_START_TIME_TSF,
- req->info.scan_start_tsf, NL80211_BSS_PAD) ||
+ info->scan_start_tsf, NL80211_BSS_PAD) ||
nla_put(msg, NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, ETH_ALEN,
- req->info.tsf_bssid)))
+ info->tsf_bssid)))
goto nla_put_failure;
return 0;
@@ -15574,7 +16039,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
const u8 *buf, size_t len,
enum nl80211_commands cmd, gfp_t gfp,
int uapsd_queues, const u8 *req_ies,
- size_t req_ies_len)
+ size_t req_ies_len, bool reconnect)
{
struct sk_buff *msg;
void *hdr;
@@ -15596,6 +16061,9 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
nla_put(msg, NL80211_ATTR_REQ_IE, req_ies_len, req_ies)))
goto nla_put_failure;
+ if (reconnect && nla_put_flag(msg, NL80211_ATTR_RECONNECT_REQUESTED))
+ goto nla_put_failure;
+
if (uapsd_queues >= 0) {
struct nlattr *nla_wmm =
nla_nest_start_noflag(msg, NL80211_ATTR_STA_WME);
@@ -15624,7 +16092,8 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
size_t len, gfp_t gfp)
{
nl80211_send_mlme_event(rdev, netdev, buf, len,
- NL80211_CMD_AUTHENTICATE, gfp, -1, NULL, 0);
+ NL80211_CMD_AUTHENTICATE, gfp, -1, NULL, 0,
+ false);
}
void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
@@ -15634,23 +16103,25 @@ void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
{
nl80211_send_mlme_event(rdev, netdev, buf, len,
NL80211_CMD_ASSOCIATE, gfp, uapsd_queues,
- req_ies, req_ies_len);
+ req_ies, req_ies_len, false);
}
void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
struct net_device *netdev, const u8 *buf,
- size_t len, gfp_t gfp)
+ size_t len, bool reconnect, gfp_t gfp)
{
nl80211_send_mlme_event(rdev, netdev, buf, len,
- NL80211_CMD_DEAUTHENTICATE, gfp, -1, NULL, 0);
+ NL80211_CMD_DEAUTHENTICATE, gfp, -1, NULL, 0,
+ reconnect);
}
void nl80211_send_disassoc(struct cfg80211_registered_device *rdev,
struct net_device *netdev, const u8 *buf,
- size_t len, gfp_t gfp)
+ size_t len, bool reconnect, gfp_t gfp)
{
nl80211_send_mlme_event(rdev, netdev, buf, len,
- NL80211_CMD_DISASSOCIATE, gfp, -1, NULL, 0);
+ NL80211_CMD_DISASSOCIATE, gfp, -1, NULL, 0,
+ reconnect);
}
void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf,
@@ -15681,7 +16152,7 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf,
trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len);
nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1,
- NULL, 0);
+ NULL, 0, false);
}
EXPORT_SYMBOL(cfg80211_rx_unprot_mlme_mgmt);
@@ -16782,7 +17253,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
struct cfg80211_chan_def *chandef,
gfp_t gfp,
enum nl80211_commands notif,
- u8 count)
+ u8 count, bool quiet)
{
struct sk_buff *msg;
void *hdr;
@@ -16803,9 +17274,13 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
if (nl80211_send_chandef(msg, chandef))
goto nla_put_failure;
- if ((notif == NL80211_CMD_CH_SWITCH_STARTED_NOTIFY) &&
- (nla_put_u32(msg, NL80211_ATTR_CH_SWITCH_COUNT, count)))
+ if (notif == NL80211_CMD_CH_SWITCH_STARTED_NOTIFY) {
+ if (nla_put_u32(msg, NL80211_ATTR_CH_SWITCH_COUNT, count))
goto nla_put_failure;
+ if (quiet &&
+ nla_put_flag(msg, NL80211_ATTR_CH_SWITCH_BLOCK_TX))
+ goto nla_put_failure;
+ }
genlmsg_end(msg, hdr);
@@ -16838,13 +17313,13 @@ void cfg80211_ch_switch_notify(struct net_device *dev,
cfg80211_sched_dfs_chan_update(rdev);
nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL,
- NL80211_CMD_CH_SWITCH_NOTIFY, 0);
+ NL80211_CMD_CH_SWITCH_NOTIFY, 0, false);
}
EXPORT_SYMBOL(cfg80211_ch_switch_notify);
void cfg80211_ch_switch_started_notify(struct net_device *dev,
struct cfg80211_chan_def *chandef,
- u8 count)
+ u8 count, bool quiet)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct wiphy *wiphy = wdev->wiphy;
@@ -16853,7 +17328,8 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev,
trace_cfg80211_ch_switch_started_notify(dev, chandef);
nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL,
- NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, count);
+ NL80211_CMD_CH_SWITCH_STARTED_NOTIFY,
+ count, quiet);
}
EXPORT_SYMBOL(cfg80211_ch_switch_started_notify);
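With the added parameter, a caller (e.g. mac80211) now reports whether TX is blocked during the countdown, which userspace then sees as NL80211_ATTR_CH_SWITCH_BLOCK_TX on the started notification. Illustrative call, with local variable names assumed:

	cfg80211_ch_switch_started_notify(sdata->dev, &csa_chandef,
					  csa_count, block_tx);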
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index d3e8e426c486..a3f387770f1b 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Portions of this file
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018, 2020 Intel Corporation
*/
#ifndef __NET_WIRELESS_NL80211_H
#define __NET_WIRELESS_NL80211_H
@@ -69,10 +69,12 @@ void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
const u8 *req_ies, size_t req_ies_len);
void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
struct net_device *netdev,
- const u8 *buf, size_t len, gfp_t gfp);
+ const u8 *buf, size_t len,
+ bool reconnect, gfp_t gfp);
void nl80211_send_disassoc(struct cfg80211_registered_device *rdev,
struct net_device *netdev,
- const u8 *buf, size_t len, gfp_t gfp);
+ const u8 *buf, size_t len,
+ bool reconnect, gfp_t gfp);
void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev,
struct net_device *netdev,
const u8 *addr, gfp_t gfp);
diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c
index d5e28239e030..36f1b59a78bf 100644
--- a/net/wireless/radiotap.c
+++ b/net/wireless/radiotap.c
@@ -59,6 +59,7 @@ static const struct ieee80211_radiotap_namespace radiotap_ns = {
* @iterator: radiotap_iterator to initialize
* @radiotap_header: radiotap header to parse
* @max_length: total length we can parse into (eg, whole packet length)
+ * @vns: vendor namespaces to parse
*
* Returns: 0 or a negative error code if there is a problem.
*
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 950d57494168..8b1358d04ca2 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -582,16 +582,6 @@ static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev,
return ret;
}
-static inline int rdev_set_wds_peer(struct cfg80211_registered_device *rdev,
- struct net_device *dev, const u8 *addr)
-{
- int ret;
- trace_rdev_set_wds_peer(&rdev->wiphy, dev, addr);
- ret = rdev->ops->set_wds_peer(&rdev->wiphy, dev, addr);
- trace_rdev_return_int(&rdev->wiphy, ret);
- return ret;
-}
-
static inline int
rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
struct net_device *dev,
@@ -1356,4 +1346,16 @@ static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev,
return ret;
}
+static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev,
+ struct cfg80211_sar_specs *sar)
+{
+ int ret;
+
+ trace_rdev_set_sar_specs(&rdev->wiphy, sar);
+ ret = rdev->ops->set_sar_specs(&rdev->wiphy, sar);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+
+ return ret;
+}
+
#endif /* __CFG80211_RDEV_OPS */
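A minimal sketch of the driver side the new rdev op calls into, assuming the cfg80211_sar_specs layout used in nl80211_set_sar_specs() above (type, num_sub_specs, sub_specs[].freq_range_index / .power); drv_send_sar_limit() is a hypothetical helper standing in for whatever firmware interface the driver has:

	static int drv_set_sar_specs(struct wiphy *wiphy,
				     struct cfg80211_sar_specs *sar)
	{
		u32 i;

		if (sar->type != NL80211_SAR_TYPE_POWER)
			return -EINVAL;

		for (i = 0; i < sar->num_sub_specs; i++) {
			/* push the limit for this advertised frequency range
			 * to the firmware (hypothetical helper)
			 */
			drv_send_sar_limit(wiphy_priv(wiphy),
					   sar->sub_specs[i].freq_range_index,
					   sar->sub_specs[i].power);
		}

		return 0;
	}

	/* hooked up via cfg80211_ops: .set_sar_specs = drv_set_sar_specs, */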
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index d8a90d397423..8114bba8556c 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -5,7 +5,7 @@
* Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2019 Intel Corporation
+ * Copyright (C) 2018 - 2021 Intel Corporation
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -139,6 +139,11 @@ static const struct ieee80211_regdomain *get_cfg80211_regdom(void)
return rcu_dereference_rtnl(cfg80211_regdomain);
}
+/*
+ * Returns the regulatory domain associated with the wiphy.
+ *
+ * Requires either RTNL or RCU protection
+ */
const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy)
{
return rcu_dereference_rtnl(wiphy->regd);
@@ -1594,7 +1599,7 @@ freq_reg_info_regd(u32 center_freq,
/*
* We only need to know if one frequency rule was
- * was in center_freq's band, that's enough, so lets
+ * in center_freq's band, that's enough, so let's
* not overwrite it once found
*/
if (!band_rule_found)
@@ -1616,10 +1621,12 @@ static const struct ieee80211_reg_rule *
__freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw)
{
const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy);
- const struct ieee80211_reg_rule *reg_rule = NULL;
+ static const u32 bws[] = {0, 1, 2, 4, 5, 8, 10, 16, 20};
+ const struct ieee80211_reg_rule *reg_rule;
+ int i = ARRAY_SIZE(bws) - 1;
u32 bw;
- for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) {
+ for (bw = MHZ_TO_KHZ(bws[i]); bw >= min_bw; bw = MHZ_TO_KHZ(bws[i--])) {
reg_rule = freq_reg_info_regd(center_freq, regd, bw);
if (!IS_ERR(reg_rule))
return reg_rule;
@@ -1631,7 +1638,9 @@ __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw)
const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
u32 center_freq)
{
- return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(20));
+ u32 min_bw = center_freq < MHZ_TO_KHZ(1000) ? 1 : 20;
+
+ return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(min_bw));
}
EXPORT_SYMBOL(freq_reg_info);
@@ -1659,6 +1668,7 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
{
const struct ieee80211_freq_range *freq_range = NULL;
u32 max_bandwidth_khz, center_freq_khz, bw_flags = 0;
+ bool is_s1g = chan->band == NL80211_BAND_S1GHZ;
freq_range = &reg_rule->freq_range;
@@ -1678,70 +1688,72 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
MHZ_TO_KHZ(20)))
bw_flags |= IEEE80211_CHAN_NO_20MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(10))
- bw_flags |= IEEE80211_CHAN_NO_10MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(20))
- bw_flags |= IEEE80211_CHAN_NO_20MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(40))
- bw_flags |= IEEE80211_CHAN_NO_HT40;
- if (max_bandwidth_khz < MHZ_TO_KHZ(80))
- bw_flags |= IEEE80211_CHAN_NO_80MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(160))
- bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+ if (is_s1g) {
+ /* S1G is strict about non-overlapping channels. We can
+ * calculate which bandwidth is allowed per channel by finding
+ * the largest bandwidth which cleanly divides the freq_range.
+ */
+ int edge_offset;
+ int ch_bw = max_bandwidth_khz;
+
+ while (ch_bw) {
+ edge_offset = (center_freq_khz - ch_bw / 2) -
+ freq_range->start_freq_khz;
+ if (edge_offset % ch_bw == 0) {
+ switch (KHZ_TO_MHZ(ch_bw)) {
+ case 1:
+ bw_flags |= IEEE80211_CHAN_1MHZ;
+ break;
+ case 2:
+ bw_flags |= IEEE80211_CHAN_2MHZ;
+ break;
+ case 4:
+ bw_flags |= IEEE80211_CHAN_4MHZ;
+ break;
+ case 8:
+ bw_flags |= IEEE80211_CHAN_8MHZ;
+ break;
+ case 16:
+ bw_flags |= IEEE80211_CHAN_16MHZ;
+ break;
+ default:
+ /* If we got here, no bandwidths fit on
+ * this frequency, i.e. band edge.
+ */
+ bw_flags |= IEEE80211_CHAN_DISABLED;
+ break;
+ }
+ break;
+ }
+ ch_bw /= 2;
+ }
+ } else {
+ if (max_bandwidth_khz < MHZ_TO_KHZ(10))
+ bw_flags |= IEEE80211_CHAN_NO_10MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(20))
+ bw_flags |= IEEE80211_CHAN_NO_20MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(40))
+ bw_flags |= IEEE80211_CHAN_NO_HT40;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(80))
+ bw_flags |= IEEE80211_CHAN_NO_80MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(160))
+ bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+ }
return bw_flags;
}
-/*
- * Note that right now we assume the desired channel bandwidth
- * is always 20 MHz for each individual channel (HT40 uses 20 MHz
- * per channel, the primary and the extension channel).
- */
-static void handle_channel(struct wiphy *wiphy,
- enum nl80211_reg_initiator initiator,
- struct ieee80211_channel *chan)
+static void handle_channel_single_rule(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator,
+ struct ieee80211_channel *chan,
+ u32 flags,
+ struct regulatory_request *lr,
+ struct wiphy *request_wiphy,
+ const struct ieee80211_reg_rule *reg_rule)
{
- u32 flags, bw_flags = 0;
- const struct ieee80211_reg_rule *reg_rule = NULL;
+ u32 bw_flags = 0;
const struct ieee80211_power_rule *power_rule = NULL;
- struct wiphy *request_wiphy = NULL;
- struct regulatory_request *lr = get_last_request();
const struct ieee80211_regdomain *regd;
- request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);
-
- flags = chan->orig_flags;
-
- reg_rule = freq_reg_info(wiphy, ieee80211_channel_to_khz(chan));
- if (IS_ERR(reg_rule)) {
- /*
- * We will disable all channels that do not match our
- * received regulatory rule unless the hint is coming
- * from a Country IE and the Country IE had no information
- * about a band. The IEEE 802.11 spec allows for an AP
- * to send only a subset of the regulatory rules allowed,
- * so an AP in the US that only supports 2.4 GHz may only send
- * a country IE with information for the 2.4 GHz band
- * while 5 GHz is still supported.
- */
- if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
- PTR_ERR(reg_rule) == -ERANGE)
- return;
-
- if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
- request_wiphy && request_wiphy == wiphy &&
- request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
- pr_debug("Disabling freq %d.%03d MHz for good\n",
- chan->center_freq, chan->freq_offset);
- chan->orig_flags |= IEEE80211_CHAN_DISABLED;
- chan->flags = chan->orig_flags;
- } else {
- pr_debug("Disabling freq %d.%03d MHz\n",
- chan->center_freq, chan->freq_offset);
- chan->flags |= IEEE80211_CHAN_DISABLED;
- }
- return;
- }
-
regd = reg_get_regdomain(wiphy);
power_rule = &reg_rule->power_rule;
@@ -1803,6 +1815,204 @@ static void handle_channel(struct wiphy *wiphy,
chan->max_power = chan->max_reg_power;
}
+static void handle_channel_adjacent_rules(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator,
+ struct ieee80211_channel *chan,
+ u32 flags,
+ struct regulatory_request *lr,
+ struct wiphy *request_wiphy,
+ const struct ieee80211_reg_rule *rrule1,
+ const struct ieee80211_reg_rule *rrule2,
+ struct ieee80211_freq_range *comb_range)
+{
+ u32 bw_flags1 = 0;
+ u32 bw_flags2 = 0;
+ const struct ieee80211_power_rule *power_rule1 = NULL;
+ const struct ieee80211_power_rule *power_rule2 = NULL;
+ const struct ieee80211_regdomain *regd;
+
+ regd = reg_get_regdomain(wiphy);
+
+ power_rule1 = &rrule1->power_rule;
+ power_rule2 = &rrule2->power_rule;
+ bw_flags1 = reg_rule_to_chan_bw_flags(regd, rrule1, chan);
+ bw_flags2 = reg_rule_to_chan_bw_flags(regd, rrule2, chan);
+
+ if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ request_wiphy && request_wiphy == wiphy &&
+ request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
+ /* This guarantees the driver's requested regulatory domain
+ * will always be used as a base for further regulatory
+ * settings
+ */
+ chan->flags =
+ map_regdom_flags(rrule1->flags) |
+ map_regdom_flags(rrule2->flags) |
+ bw_flags1 |
+ bw_flags2;
+ chan->orig_flags = chan->flags;
+ chan->max_antenna_gain =
+ min_t(int, MBI_TO_DBI(power_rule1->max_antenna_gain),
+ MBI_TO_DBI(power_rule2->max_antenna_gain));
+ chan->orig_mag = chan->max_antenna_gain;
+ chan->max_reg_power =
+ min_t(int, MBM_TO_DBM(power_rule1->max_eirp),
+ MBM_TO_DBM(power_rule2->max_eirp));
+ chan->max_power = chan->max_reg_power;
+ chan->orig_mpwr = chan->max_reg_power;
+
+ if (chan->flags & IEEE80211_CHAN_RADAR) {
+ chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
+ if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms)
+ chan->dfs_cac_ms = max_t(unsigned int,
+ rrule1->dfs_cac_ms,
+ rrule2->dfs_cac_ms);
+ }
+
+ return;
+ }
+
+ chan->dfs_state = NL80211_DFS_USABLE;
+ chan->dfs_state_entered = jiffies;
+
+ chan->beacon_found = false;
+ chan->flags = flags | bw_flags1 | bw_flags2 |
+ map_regdom_flags(rrule1->flags) |
+ map_regdom_flags(rrule2->flags);
+
+ /* reg_rule_to_chan_bw_flags may forbid 10 and 20 MHz here
+ * (otherwise this would not be the adjacent-rule case), so recheck
+ */
+ if (cfg80211_does_bw_fit_range(comb_range,
+ ieee80211_channel_to_khz(chan),
+ MHZ_TO_KHZ(10)))
+ chan->flags &= ~IEEE80211_CHAN_NO_10MHZ;
+ if (cfg80211_does_bw_fit_range(comb_range,
+ ieee80211_channel_to_khz(chan),
+ MHZ_TO_KHZ(20)))
+ chan->flags &= ~IEEE80211_CHAN_NO_20MHZ;
+
+ chan->max_antenna_gain =
+ min_t(int, chan->orig_mag,
+ min_t(int,
+ MBI_TO_DBI(power_rule1->max_antenna_gain),
+ MBI_TO_DBI(power_rule2->max_antenna_gain)));
+ chan->max_reg_power = min_t(int,
+ MBM_TO_DBM(power_rule1->max_eirp),
+ MBM_TO_DBM(power_rule2->max_eirp));
+
+ if (chan->flags & IEEE80211_CHAN_RADAR) {
+ if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms)
+ chan->dfs_cac_ms = max_t(unsigned int,
+ rrule1->dfs_cac_ms,
+ rrule2->dfs_cac_ms);
+ else
+ chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
+ }
+
+ if (chan->orig_mpwr) {
+ /* Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER
+ * will always follow the passed country IE power settings.
+ */
+ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER)
+ chan->max_power = chan->max_reg_power;
+ else
+ chan->max_power = min(chan->orig_mpwr,
+ chan->max_reg_power);
+ } else {
+ chan->max_power = chan->max_reg_power;
+ }
+}
+
+/* Note that right now we assume the desired channel bandwidth
+ * is always 20 MHz for each individual channel (HT40 uses 20 MHz
+ * per channel, the primary and the extension channel).
+ */
+static void handle_channel(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator,
+ struct ieee80211_channel *chan)
+{
+ const u32 orig_chan_freq = ieee80211_channel_to_khz(chan);
+ struct regulatory_request *lr = get_last_request();
+ struct wiphy *request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);
+ const struct ieee80211_reg_rule *rrule = NULL;
+ const struct ieee80211_reg_rule *rrule1 = NULL;
+ const struct ieee80211_reg_rule *rrule2 = NULL;
+
+ u32 flags = chan->orig_flags;
+
+ rrule = freq_reg_info(wiphy, orig_chan_freq);
+ if (IS_ERR(rrule)) {
+ /* check for adjacent match, therefore get rules for
+ * chan - 20 MHz and chan + 20 MHz and test
+ * if reg rules are adjacent
+ */
+ rrule1 = freq_reg_info(wiphy,
+ orig_chan_freq - MHZ_TO_KHZ(20));
+ rrule2 = freq_reg_info(wiphy,
+ orig_chan_freq + MHZ_TO_KHZ(20));
+ if (!IS_ERR(rrule1) && !IS_ERR(rrule2)) {
+ struct ieee80211_freq_range comb_range;
+
+ if (rrule1->freq_range.end_freq_khz !=
+ rrule2->freq_range.start_freq_khz)
+ goto disable_chan;
+
+ comb_range.start_freq_khz =
+ rrule1->freq_range.start_freq_khz;
+ comb_range.end_freq_khz =
+ rrule2->freq_range.end_freq_khz;
+ comb_range.max_bandwidth_khz =
+ min_t(u32,
+ rrule1->freq_range.max_bandwidth_khz,
+ rrule2->freq_range.max_bandwidth_khz);
+
+ if (!cfg80211_does_bw_fit_range(&comb_range,
+ orig_chan_freq,
+ MHZ_TO_KHZ(20)))
+ goto disable_chan;
+
+ handle_channel_adjacent_rules(wiphy, initiator, chan,
+ flags, lr, request_wiphy,
+ rrule1, rrule2,
+ &comb_range);
+ return;
+ }
+
+disable_chan:
+ /* We will disable all channels that do not match our
+ * received regulatory rule unless the hint is coming
+ * from a Country IE and the Country IE had no information
+ * about a band. The IEEE 802.11 spec allows for an AP
+ * to send only a subset of the regulatory rules allowed,
+ * so an AP in the US that only supports 2.4 GHz may only send
+ * a country IE with information for the 2.4 GHz band
+ * while 5 GHz is still supported.
+ */
+ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ PTR_ERR(rrule) == -ERANGE)
+ return;
+
+ if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ request_wiphy && request_wiphy == wiphy &&
+ request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
+ pr_debug("Disabling freq %d.%03d MHz for good\n",
+ chan->center_freq, chan->freq_offset);
+ chan->orig_flags |= IEEE80211_CHAN_DISABLED;
+ chan->flags = chan->orig_flags;
+ } else {
+ pr_debug("Disabling freq %d.%03d MHz\n",
+ chan->center_freq, chan->freq_offset);
+ chan->flags |= IEEE80211_CHAN_DISABLED;
+ }
+ return;
+ }
+
+ handle_channel_single_rule(wiphy, initiator, chan, flags, lr,
+ request_wiphy, rrule);
+}
+
static void handle_band(struct wiphy *wiphy,
enum nl80211_reg_initiator initiator,
struct ieee80211_supported_band *sband)
@@ -2342,6 +2552,7 @@ static void handle_band_custom(struct wiphy *wiphy,
void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
const struct ieee80211_regdomain *regd)
{
+ const struct ieee80211_regdomain *new_regd, *tmp;
enum nl80211_band band;
unsigned int bands_set = 0;
@@ -2361,6 +2572,17 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
* on your device's supported bands.
*/
WARN_ON(!bands_set);
+ new_regd = reg_copy_regd(regd);
+ if (IS_ERR(new_regd))
+ return;
+
+ rtnl_lock();
+
+ tmp = get_wiphy_regdom(wiphy);
+ rcu_assign_pointer(wiphy->regd, new_regd);
+ rcu_free_regdom(tmp);
+
+ rtnl_unlock();
}
EXPORT_SYMBOL(wiphy_apply_custom_regulatory);
@@ -3170,7 +3392,7 @@ static void restore_custom_reg_settings(struct wiphy *wiphy)
* - send a user regulatory hint if applicable
*
* Device drivers that send a regulatory hint for a specific country
- * keep their own regulatory domain on wiphy->regd so that does does
+ * keep their own regulatory domain on wiphy->regd so that does
* not need to be remembered.
*/
static void restore_regulatory_settings(bool reset_user, bool cached)
@@ -3411,7 +3633,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
power_rule = &reg_rule->power_rule;
if (reg_rule->flags & NL80211_RRF_AUTO_BW)
- snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO",
+ snprintf(bw, sizeof(bw), "%d KHz, %u KHz AUTO",
freq_range->max_bandwidth_khz,
reg_get_max_bandwidth(rd, reg_rule));
else
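One worked pass through the S1G branch added to reg_rule_to_chan_bw_flags() above, with illustrative numbers: for a rule covering 902-928 MHz with a 16 MHz maximum bandwidth and a channel centred at 903 MHz, the loop tries 16, 8 and 4 MHz (edge offsets of -7000, -3000 and -1000 kHz, none divisible by the candidate width), then settles on 2 MHz because (903000 - 1000) - 902000 = 0 kHz divides evenly, so only IEEE80211_CHAN_2MHZ is set before the loop breaks. Together with the freq_reg_info() change above, which lowers the minimum rule bandwidth to 1 MHz below 1 GHz, this lets narrow S1G channels match their regulatory rules.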
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 04f2d198c215..1b7fec3b53cd 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -5,7 +5,7 @@
* Copyright 2008 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2016 Intel Deutschland GmbH
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*/
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -14,6 +14,8 @@
#include <linux/wireless.h>
#include <linux/nl80211.h>
#include <linux/etherdevice.h>
+#include <linux/crc32.h>
+#include <linux/bitfield.h>
#include <net/arp.h>
#include <net/cfg80211.h>
#include <net/cfg80211-wext.h>
@@ -55,7 +57,7 @@
*
* Also note that the hidden_beacon_bss pointer is only relevant
* if the driver uses something other than the IEs, e.g. private
- * data stored stored in the BSS struct, since the beacon IEs are
+ * data stored in the BSS struct, since the beacon IEs are
* also linked into the probe response struct.
*/
@@ -74,6 +76,43 @@ MODULE_PARM_DESC(bss_entries_limit,
#define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ)
+/**
+ * struct cfg80211_colocated_ap - colocated AP information
+ *
+ * @list: linked list to all colocated APs
+ * @bssid: BSSID of the reported AP
+ * @ssid: SSID of the reported AP
+ * @ssid_len: length of the ssid
+ * @center_freq: frequency the reported AP is on
+ * @unsolicited_probe: the reported AP is part of an ESS, where all the APs
+ * that operate in the same channel as the reported AP and that might be
+ * detected by a STA receiving this frame, are transmitting unsolicited
+ * Probe Response frames every 20 TUs
+ * @oct_recommended: OCT is recommended to exchange MMPDUs with the reported AP
+ * @same_ssid: the reported AP has the same SSID as the reporting AP
+ * @multi_bss: the reported AP is part of a multiple BSSID set
+ * @transmitted_bssid: the reported AP is the transmitting BSSID
+ * @colocated_ess: all the APs that share the same ESS as the reported AP are
+ * colocated and can be discovered via legacy bands.
+ * @short_ssid_valid: short_ssid is valid and can be used
+ * @short_ssid: the short SSID for this SSID
+ */
+struct cfg80211_colocated_ap {
+ struct list_head list;
+ u8 bssid[ETH_ALEN];
+ u8 ssid[IEEE80211_MAX_SSID_LEN];
+ size_t ssid_len;
+ u32 short_ssid;
+ u32 center_freq;
+ u8 unsolicited_probe:1,
+ oct_recommended:1,
+ same_ssid:1,
+ multi_bss:1,
+ transmitted_bssid:1,
+ colocated_ess:1,
+ short_ssid_valid:1;
+};
+
static void bss_free(struct cfg80211_internal_bss *bss)
{
struct cfg80211_bss_ies *ies;
@@ -448,10 +487,431 @@ static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev)
return ret;
}
+static u8 cfg80211_parse_bss_param(u8 data,
+ struct cfg80211_colocated_ap *coloc_ap)
+{
+ coloc_ap->oct_recommended =
+ u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED);
+ coloc_ap->same_ssid =
+ u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_SAME_SSID);
+ coloc_ap->multi_bss =
+ u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID);
+ coloc_ap->transmitted_bssid =
+ u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID);
+ coloc_ap->unsolicited_probe =
+ u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE);
+ coloc_ap->colocated_ess =
+ u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS);
+
+ return u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_AP);
+}
+
+static int cfg80211_calc_short_ssid(const struct cfg80211_bss_ies *ies,
+ const struct element **elem, u32 *s_ssid)
+{
+
+ *elem = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len);
+ if (!*elem || (*elem)->datalen > IEEE80211_MAX_SSID_LEN)
+ return -EINVAL;
+
+ *s_ssid = ~crc32_le(~0, (*elem)->data, (*elem)->datalen);
+ return 0;
+}
+
+static void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list)
+{
+ struct cfg80211_colocated_ap *ap, *tmp_ap;
+
+ list_for_each_entry_safe(ap, tmp_ap, coloc_ap_list, list) {
+ list_del(&ap->list);
+ kfree(ap);
+ }
+}
+
+static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry,
+ const u8 *pos, u8 length,
+ const struct element *ssid_elem,
+ int s_ssid_tmp)
+{
+ /* skip the TBTT offset */
+ pos++;
+
+ memcpy(entry->bssid, pos, ETH_ALEN);
+ pos += ETH_ALEN;
+
+ if (length == IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM) {
+ memcpy(&entry->short_ssid, pos,
+ sizeof(entry->short_ssid));
+ entry->short_ssid_valid = true;
+ pos += 4;
+ }
+
+ /* skip non-colocated APs */
+ if (!cfg80211_parse_bss_param(*pos, entry))
+ return -EINVAL;
+ pos++;
+
+ if (length == IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM) {
+ /*
+ * No information about the short SSID. Consider the entry valid
+ * for now. It will be dropped later if there are explicit
+ * SSIDs that need to be matched
+ */
+ if (!entry->same_ssid)
+ return 0;
+ }
+
+ if (entry->same_ssid) {
+ entry->short_ssid = s_ssid_tmp;
+ entry->short_ssid_valid = true;
+
+ /*
+ * This is safe because we validate datalen in
+ * cfg80211_parse_colocated_ap(), before calling this
+ * function.
+ */
+ memcpy(&entry->ssid, &ssid_elem->data,
+ ssid_elem->datalen);
+ entry->ssid_len = ssid_elem->datalen;
+ }
+ return 0;
+}
+
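For orientation, the per-entry layout cfg80211_parse_ap_info() walks; the sizes follow directly from the copies above, and the two accepted lengths correspond to the variants with and without the short SSID:

	/*
	 *  octet 0       TBTT offset (skipped)
	 *  octets 1-6    BSSID
	 *  octets 7-10   short SSID       (only in the longer variant)
	 *  last octet    BSS parameters   (colocated / same-SSID / multi-BSS bits)
	 */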
+static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies,
+ struct list_head *list)
+{
+ struct ieee80211_neighbor_ap_info *ap_info;
+ const struct element *elem, *ssid_elem;
+ const u8 *pos, *end;
+ u32 s_ssid_tmp;
+ int n_coloc = 0, ret;
+ LIST_HEAD(ap_list);
+
+ elem = cfg80211_find_elem(WLAN_EID_REDUCED_NEIGHBOR_REPORT, ies->data,
+ ies->len);
+ if (!elem || elem->datalen > IEEE80211_MAX_SSID_LEN)
+ return 0;
+
+ pos = elem->data;
+ end = pos + elem->datalen;
+
+ ret = cfg80211_calc_short_ssid(ies, &ssid_elem, &s_ssid_tmp);
+ if (ret)
+ return ret;
+
+ /* RNR IE may contain more than one NEIGHBOR_AP_INFO */
+ while (pos + sizeof(*ap_info) <= end) {
+ enum nl80211_band band;
+ int freq;
+ u8 length, i, count;
+
+ ap_info = (void *)pos;
+ count = u8_get_bits(ap_info->tbtt_info_hdr,
+ IEEE80211_AP_INFO_TBTT_HDR_COUNT) + 1;
+ length = ap_info->tbtt_info_len;
+
+ pos += sizeof(*ap_info);
+
+ if (!ieee80211_operating_class_to_band(ap_info->op_class,
+ &band))
+ break;
+
+ freq = ieee80211_channel_to_frequency(ap_info->channel, band);
+
+ if (end - pos < count * ap_info->tbtt_info_len)
+ break;
+
+ /*
+ * TBTT info must include bss params + BSSID +
+ * (short SSID or the same_ssid bit set).
+ * Ignore other options and move to the
+ * next AP info.
+ */
+ if (band != NL80211_BAND_6GHZ ||
+ (length != IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM &&
+ length < IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM)) {
+ pos += count * ap_info->tbtt_info_len;
+ continue;
+ }
+
+ for (i = 0; i < count; i++) {
+ struct cfg80211_colocated_ap *entry;
+
+ entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN,
+ GFP_ATOMIC);
+
+ if (!entry)
+ break;
+
+ entry->center_freq = freq;
+
+ if (!cfg80211_parse_ap_info(entry, pos, length,
+ ssid_elem, s_ssid_tmp)) {
+ n_coloc++;
+ list_add_tail(&entry->list, &ap_list);
+ } else {
+ kfree(entry);
+ }
+
+ pos += ap_info->tbtt_info_len;
+ }
+ }
+
+ if (pos != end) {
+ cfg80211_free_coloc_ap_list(&ap_list);
+ return 0;
+ }
+
+ list_splice_tail(&ap_list, list);
+ return n_coloc;
+}
+
+static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request,
+ struct ieee80211_channel *chan,
+ bool add_to_6ghz)
+{
+ int i;
+ u32 n_channels = request->n_channels;
+ struct cfg80211_scan_6ghz_params *params =
+ &request->scan_6ghz_params[request->n_6ghz_params];
+
+ for (i = 0; i < n_channels; i++) {
+ if (request->channels[i] == chan) {
+ if (add_to_6ghz)
+ params->channel_idx = i;
+ return;
+ }
+ }
+
+ request->channels[n_channels] = chan;
+ if (add_to_6ghz)
+ request->scan_6ghz_params[request->n_6ghz_params].channel_idx =
+ n_channels;
+
+ request->n_channels++;
+}
+
+static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap,
+ struct cfg80211_scan_request *request)
+{
+ int i;
+ u32 s_ssid;
+
+ for (i = 0; i < request->n_ssids; i++) {
+ /* wildcard ssid in the scan request */
+ if (!request->ssids[i].ssid_len)
+ return true;
+
+ if (ap->ssid_len &&
+ ap->ssid_len == request->ssids[i].ssid_len) {
+ if (!memcmp(request->ssids[i].ssid, ap->ssid,
+ ap->ssid_len))
+ return true;
+ } else if (ap->short_ssid_valid) {
+ s_ssid = ~crc32_le(~0, request->ssids[i].ssid,
+ request->ssids[i].ssid_len);
+
+ if (ap->short_ssid == s_ssid)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
+{
+ u8 i;
+ struct cfg80211_colocated_ap *ap;
+ int n_channels, count = 0, err;
+ struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req;
+ LIST_HEAD(coloc_ap_list);
+ bool need_scan_psc = true;
+ const struct ieee80211_sband_iftype_data *iftd;
+
+ rdev_req->scan_6ghz = true;
+
+ if (!rdev->wiphy.bands[NL80211_BAND_6GHZ])
+ return -EOPNOTSUPP;
+
+ iftd = ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ],
+ rdev_req->wdev->iftype);
+ if (!iftd || !iftd->he_cap.has_he)
+ return -EOPNOTSUPP;
+
+ n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels;
+
+ if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) {
+ struct cfg80211_internal_bss *intbss;
+
+ spin_lock_bh(&rdev->bss_lock);
+ list_for_each_entry(intbss, &rdev->bss_list, list) {
+ struct cfg80211_bss *res = &intbss->pub;
+ const struct cfg80211_bss_ies *ies;
+
+ ies = rcu_access_pointer(res->ies);
+ count += cfg80211_parse_colocated_ap(ies,
+ &coloc_ap_list);
+ }
+ spin_unlock_bh(&rdev->bss_lock);
+ }
+
+ request = kzalloc(struct_size(request, channels, n_channels) +
+ sizeof(*request->scan_6ghz_params) * count,
+ GFP_KERNEL);
+ if (!request) {
+ cfg80211_free_coloc_ap_list(&coloc_ap_list);
+ return -ENOMEM;
+ }
+
+ *request = *rdev_req;
+ request->n_channels = 0;
+ request->scan_6ghz_params =
+ (void *)&request->channels[n_channels];
+
+ /*
+ * PSC channels should not be scanned on a direct scan with a single
+ * SSID when at least one of the reported co-located APs with the same
+ * SSID indicates that all APs in that ESS are co-located.
+ */
+ if (count && request->n_ssids == 1 && request->ssids[0].ssid_len) {
+ list_for_each_entry(ap, &coloc_ap_list, list) {
+ if (ap->colocated_ess &&
+ cfg80211_find_ssid_match(ap, request)) {
+ need_scan_psc = false;
+ break;
+ }
+ }
+ }
+
+ /*
+ * Add to the scan request the channels that need to be scanned
+ * regardless of the co-located APs (PSC channels, or all channels
+ * if NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set).
+ */
+ for (i = 0; i < rdev_req->n_channels; i++) {
+ if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ &&
+ ((need_scan_psc &&
+ cfg80211_channel_is_psc(rdev_req->channels[i])) ||
+ !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) {
+ cfg80211_scan_req_add_chan(request,
+ rdev_req->channels[i],
+ false);
+ }
+ }
+
+ if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))
+ goto skip;
+
+ list_for_each_entry(ap, &coloc_ap_list, list) {
+ bool found = false;
+ struct cfg80211_scan_6ghz_params *scan_6ghz_params =
+ &request->scan_6ghz_params[request->n_6ghz_params];
+ struct ieee80211_channel *chan =
+ ieee80211_get_channel(&rdev->wiphy, ap->center_freq);
+
+ if (!chan || chan->flags & IEEE80211_CHAN_DISABLED)
+ continue;
+
+ for (i = 0; i < rdev_req->n_channels; i++) {
+ if (rdev_req->channels[i] == chan)
+ found = true;
+ }
+
+ if (!found)
+ continue;
+
+ if (request->n_ssids > 0 &&
+ !cfg80211_find_ssid_match(ap, request))
+ continue;
+
+ cfg80211_scan_req_add_chan(request, chan, true);
+ memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN);
+ scan_6ghz_params->short_ssid = ap->short_ssid;
+ scan_6ghz_params->short_ssid_valid = ap->short_ssid_valid;
+ scan_6ghz_params->unsolicited_probe = ap->unsolicited_probe;
+
+ /*
+ * If a PSC channel is added to the scan and 'need_scan_psc' is
+ * set to false, then all the APs that the scan logic is
+ * interested in on that channel are co-located, and thus there
+ * is no need to perform the initial PSC channel listen.
+ */
+ if (cfg80211_channel_is_psc(chan) && !need_scan_psc)
+ scan_6ghz_params->psc_no_listen = true;
+
+ request->n_6ghz_params++;
+ }
+
+skip:
+ cfg80211_free_coloc_ap_list(&coloc_ap_list);
+
+ if (request->n_channels) {
+ struct cfg80211_scan_request *old = rdev->int_scan_req;
+
+ rdev->int_scan_req = request;
+
+ /*
+ * If this scan follows a previous scan, save the scan start
+ * info from the first part of the scan
+ */
+ if (old)
+ rdev->int_scan_req->info = old->info;
+
+ err = rdev_scan(rdev, request);
+ if (err) {
+ rdev->int_scan_req = old;
+ kfree(request);
+ } else {
+ kfree(old);
+ }
+
+ return err;
+ }
+
+ kfree(request);
+ return -EINVAL;
+}
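
The allocation at the top of cfg80211_scan_6ghz() packs two variable-length arrays into a single kzalloc(): the channels[] flexible array of the scan request, plus the scan_6ghz_params[] block placed immediately behind it and reached through a pointer fixed up after the *request = *rdev_req copy. A condensed, hypothetical illustration of that layout (all identifiers below are invented for the example; only struct_size() and the tail-pointer fixup mirror the function above):

    #include <linux/slab.h>
    #include <linux/overflow.h>

    struct demo_param {
            int value;
    };

    /* One allocation carries both the flexible items[] array and a second
     * params[] array laid out right after it.
     */
    struct demo_req {
            unsigned int n_items;
            struct demo_param *params;      /* points into the same allocation */
            void *items[];                  /* flexible array member */
    };

    static struct demo_req *demo_alloc(unsigned int n_items, unsigned int n_params)
    {
            struct demo_req *req;

            req = kzalloc(struct_size(req, items, n_items) +
                          sizeof(*req->params) * n_params, GFP_KERNEL);
            if (!req)
                    return NULL;

            req->n_items = n_items;
            /* the params block starts right after the last items[] slot */
            req->params = (void *)&req->items[n_items];
            return req;
    }
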
+
+int cfg80211_scan(struct cfg80211_registered_device *rdev)
+{
+ struct cfg80211_scan_request *request;
+ struct cfg80211_scan_request *rdev_req = rdev->scan_req;
+ u32 n_channels = 0, idx, i;
+
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ))
+ return rdev_scan(rdev, rdev_req);
+
+ for (i = 0; i < rdev_req->n_channels; i++) {
+ if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ)
+ n_channels++;
+ }
+
+ if (!n_channels)
+ return cfg80211_scan_6ghz(rdev);
+
+ request = kzalloc(struct_size(request, channels, n_channels),
+ GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ *request = *rdev_req;
+ request->n_channels = n_channels;
+
+ for (i = idx = 0; i < rdev_req->n_channels; i++) {
+ if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ)
+ request->channels[idx++] = rdev_req->channels[i];
+ }
+
+ rdev_req->scan_6ghz = false;
+ rdev->int_scan_req = request;
+ return rdev_scan(rdev, request);
+}
+
void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
bool send_message)
{
- struct cfg80211_scan_request *request;
+ struct cfg80211_scan_request *request, *rdev_req;
struct wireless_dev *wdev;
struct sk_buff *msg;
#ifdef CONFIG_CFG80211_WEXT
@@ -466,11 +926,18 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
return;
}
- request = rdev->scan_req;
- if (!request)
+ rdev_req = rdev->scan_req;
+ if (!rdev_req)
return;
- wdev = request->wdev;
+ wdev = rdev_req->wdev;
+ request = rdev->int_scan_req ? rdev->int_scan_req : rdev_req;
+
+ if (wdev_running(wdev) &&
+ (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) &&
+ !rdev_req->scan_6ghz && !request->info.aborted &&
+ !cfg80211_scan_6ghz(rdev))
+ return;
/*
* This must be before sending the other events!
@@ -501,8 +968,11 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
if (wdev->netdev)
dev_put(wdev->netdev);
+ kfree(rdev->int_scan_req);
+ rdev->int_scan_req = NULL;
+
+ kfree(rdev->scan_req);
rdev->scan_req = NULL;
- kfree(request);
if (!send_message)
rdev->scan_msg = msg;
@@ -525,10 +995,25 @@ void __cfg80211_scan_done(struct work_struct *wk)
void cfg80211_scan_done(struct cfg80211_scan_request *request,
struct cfg80211_scan_info *info)
{
+ struct cfg80211_scan_info old_info = request->info;
+
trace_cfg80211_scan_done(request, info);
- WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req);
+ WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req &&
+ request != wiphy_to_rdev(request->wiphy)->int_scan_req);
request->info = *info;
+
+ /*
+ * In case the scan is split, scan_start_tsf and tsf_bssid should
+ * come from the first part. In that case old_info.scan_start_tsf
+ * should be non-zero.
+ */
+ if (request->scan_6ghz && old_info.scan_start_tsf) {
+ request->info.scan_start_tsf = old_info.scan_start_tsf;
+ memcpy(request->info.tsf_bssid, old_info.tsf_bssid,
+ sizeof(request->info.tsf_bssid));
+ }
+
request->notified = true;
queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk);
}
@@ -1315,15 +1800,24 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
int channel_number = -1;
struct ieee80211_channel *alt_channel;
- tmp = cfg80211_find_ie(WLAN_EID_DS_PARAMS, ie, ielen);
- if (tmp && tmp[1] == 1) {
- channel_number = tmp[2];
+ if (channel->band == NL80211_BAND_S1GHZ) {
+ tmp = cfg80211_find_ie(WLAN_EID_S1G_OPERATION, ie, ielen);
+ if (tmp && tmp[1] >= sizeof(struct ieee80211_s1g_oper_ie)) {
+ struct ieee80211_s1g_oper_ie *s1gop = (void *)(tmp + 2);
+
+ channel_number = s1gop->primary_ch;
+ }
} else {
- tmp = cfg80211_find_ie(WLAN_EID_HT_OPERATION, ie, ielen);
- if (tmp && tmp[1] >= sizeof(struct ieee80211_ht_operation)) {
- struct ieee80211_ht_operation *htop = (void *)(tmp + 2);
+ tmp = cfg80211_find_ie(WLAN_EID_DS_PARAMS, ie, ielen);
+ if (tmp && tmp[1] == 1) {
+ channel_number = tmp[2];
+ } else {
+ tmp = cfg80211_find_ie(WLAN_EID_HT_OPERATION, ie, ielen);
+ if (tmp && tmp[1] >= sizeof(struct ieee80211_ht_operation)) {
+ struct ieee80211_ht_operation *htop = (void *)(tmp + 2);
- channel_number = htop->primary_chan;
+ channel_number = htop->primary_chan;
+ }
}
}
@@ -1405,6 +1899,9 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
tmp.pub.beacon_interval = beacon_interval;
tmp.pub.capability = capability;
tmp.ts_boottime = data->boottime_ns;
+ tmp.parent_tsf = data->parent_tsf;
+ ether_addr_copy(tmp.parent_bssid, data->parent_bssid);
+
if (non_tx_data) {
tmp.pub.transmitted_bss = non_tx_data->tx_bss;
ts = bss_from_pub(non_tx_data->tx_bss)->ts;
@@ -1488,7 +1985,7 @@ static const struct element
ielen - (mbssid_end - ie));
/*
- * If is is not the last subelement in current MBSSID IE or there isn't
+ * If it is not the last subelement in current MBSSID IE or there isn't
* a next MBSSID IE - profile is complete.
*/
if ((sub_elem->data + sub_elem->datalen < mbssid_end - 1) ||
@@ -1807,8 +2304,11 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
struct cfg80211_bss_ies *ies;
struct ieee80211_channel *channel;
bool signal_valid;
- size_t ielen = len - offsetof(struct ieee80211_mgmt,
- u.probe_resp.variable);
+ struct ieee80211_ext *ext = NULL;
+ u8 *bssid, *variable;
+ u16 capability, beacon_int;
+ size_t ielen, min_hdr_len = offsetof(struct ieee80211_mgmt,
+ u.probe_resp.variable);
int bss_type;
BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) !=
@@ -1826,21 +2326,57 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
(data->signal < 0 || data->signal > 100)))
return NULL;
- if (WARN_ON(len < offsetof(struct ieee80211_mgmt, u.probe_resp.variable)))
+ if (ieee80211_is_s1g_beacon(mgmt->frame_control)) {
+ ext = (void *) mgmt;
+ min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon);
+ if (ieee80211_is_s1g_short_beacon(mgmt->frame_control))
+ min_hdr_len = offsetof(struct ieee80211_ext,
+ u.s1g_short_beacon.variable);
+ }
+
+ if (WARN_ON(len < min_hdr_len))
return NULL;
- channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable,
+ ielen = len - min_hdr_len;
+ variable = mgmt->u.probe_resp.variable;
+ if (ext) {
+ if (ieee80211_is_s1g_short_beacon(mgmt->frame_control))
+ variable = ext->u.s1g_short_beacon.variable;
+ else
+ variable = ext->u.s1g_beacon.variable;
+ }
+
+ channel = cfg80211_get_bss_channel(wiphy, variable,
ielen, data->chan, data->scan_width);
if (!channel)
return NULL;
+ if (ext) {
+ struct ieee80211_s1g_bcn_compat_ie *compat;
+ u8 *ie;
+
+ ie = (void *)cfg80211_find_ie(WLAN_EID_S1G_BCN_COMPAT,
+ variable, ielen);
+ if (!ie)
+ return NULL;
+ compat = (void *)(ie + 2);
+ bssid = ext->u.s1g_beacon.sa;
+ capability = le16_to_cpu(compat->compat_info);
+ beacon_int = le16_to_cpu(compat->beacon_int);
+ } else {
+ bssid = mgmt->bssid;
+ beacon_int = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
+ capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
+ }
+
ies = kzalloc(sizeof(*ies) + ielen, gfp);
if (!ies)
return NULL;
ies->len = ielen;
ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp);
- ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control);
- memcpy(ies->data, mgmt->u.probe_resp.variable, ielen);
+ ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control) ||
+ ieee80211_is_s1g_beacon(mgmt->frame_control);
+ memcpy(ies->data, variable, ielen);
if (ieee80211_is_probe_resp(mgmt->frame_control))
rcu_assign_pointer(tmp.pub.proberesp_ies, ies);
@@ -1848,12 +2384,12 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
rcu_assign_pointer(tmp.pub.beacon_ies, ies);
rcu_assign_pointer(tmp.pub.ies, ies);
- memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
+ memcpy(tmp.pub.bssid, bssid, ETH_ALEN);
+ tmp.pub.beacon_interval = beacon_int;
+ tmp.pub.capability = capability;
tmp.pub.channel = channel;
tmp.pub.scan_width = data->scan_width;
tmp.pub.signal = data->signal;
- tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
- tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
tmp.ts_boottime = data->boottime_ns;
tmp.parent_tsf = data->parent_tsf;
tmp.pub.chains = data->chains;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 079ce320dc1e..38df713f2e2e 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -24,7 +24,7 @@
/*
* Software SME in cfg80211, using auth/assoc/deauth calls to the
- * driver. This is is for implementing nl80211's connect/disconnect
+ * driver. This is for implementing nl80211's connect/disconnect
* and wireless extensions (if configured.)
*/
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 6e218a0acd4e..76b777d5903f 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -838,11 +838,6 @@ DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath,
TP_ARGS(wiphy, netdev, mac)
);
-DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_set_wds_peer,
- TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
- TP_ARGS(wiphy, netdev, mac)
-);
-
TRACE_EVENT(rdev_dump_station,
TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
u8 *mac),
@@ -2684,19 +2679,23 @@ DEFINE_EVENT(netdev_frame_event, cfg80211_rx_mlme_mgmt,
);
TRACE_EVENT(cfg80211_tx_mlme_mgmt,
- TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
- TP_ARGS(netdev, buf, len),
+ TP_PROTO(struct net_device *netdev, const u8 *buf, int len,
+ bool reconnect),
+ TP_ARGS(netdev, buf, len, reconnect),
TP_STRUCT__entry(
NETDEV_ENTRY
__dynamic_array(u8, frame, len)
+ __field(int, reconnect)
),
TP_fast_assign(
NETDEV_ASSIGN;
memcpy(__get_dynamic_array(frame), buf, len);
+ __entry->reconnect = reconnect;
),
- TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x",
+ TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x reconnect:%d",
NETDEV_PR_ARG,
- le16_to_cpup((__le16 *)__get_dynamic_array(frame)))
+ le16_to_cpup((__le16 *)__get_dynamic_array(frame)),
+ __entry->reconnect)
);
DECLARE_EVENT_CLASS(netdev_mac_evt,
@@ -3547,6 +3546,25 @@ TRACE_EVENT(rdev_reset_tid_config,
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tids: 0x%x",
WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tids)
);
+
+TRACE_EVENT(rdev_set_sar_specs,
+ TP_PROTO(struct wiphy *wiphy, struct cfg80211_sar_specs *sar),
+ TP_ARGS(wiphy, sar),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(u16, type)
+ __field(u16, num)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->type = sar->type;
+ __entry->num = sar->num_sub_specs;
+
+ ),
+ TP_printk(WIPHY_PR_FMT ", Set type:%d, num_specs:%d",
+ WIPHY_PR_ARG, __entry->type, __entry->num)
+);
+
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 6fa99df52f86..b4acc805114b 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -111,6 +111,33 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band)
}
EXPORT_SYMBOL(ieee80211_channel_to_freq_khz);
+enum nl80211_chan_width
+ieee80211_s1g_channel_width(const struct ieee80211_channel *chan)
+{
+ if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ))
+ return NL80211_CHAN_WIDTH_20_NOHT;
+
+ /* S1G defines a single allowed channel width per channel.
+ * Extract that width here.
+ */
+ if (chan->flags & IEEE80211_CHAN_1MHZ)
+ return NL80211_CHAN_WIDTH_1;
+ else if (chan->flags & IEEE80211_CHAN_2MHZ)
+ return NL80211_CHAN_WIDTH_2;
+ else if (chan->flags & IEEE80211_CHAN_4MHZ)
+ return NL80211_CHAN_WIDTH_4;
+ else if (chan->flags & IEEE80211_CHAN_8MHZ)
+ return NL80211_CHAN_WIDTH_8;
+ else if (chan->flags & IEEE80211_CHAN_16MHZ)
+ return NL80211_CHAN_WIDTH_16;
+
+ pr_err("unknown channel width for channel at %dKHz?\n",
+ ieee80211_channel_to_khz(chan));
+
+ return NL80211_CHAN_WIDTH_1;
+}
+EXPORT_SYMBOL(ieee80211_s1g_channel_width);
+
int ieee80211_freq_khz_to_channel(u32 freq)
{
/* TODO: just handle MHz for now */
@@ -245,18 +272,53 @@ bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher)
return false;
}
-int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
- struct key_params *params, int key_idx,
- bool pairwise, const u8 *mac_addr)
+static bool
+cfg80211_igtk_cipher_supported(struct cfg80211_registered_device *rdev)
{
- int max_key_idx = 5;
+ struct wiphy *wiphy = &rdev->wiphy;
+ int i;
- if (wiphy_ext_feature_isset(&rdev->wiphy,
- NL80211_EXT_FEATURE_BEACON_PROTECTION) ||
- wiphy_ext_feature_isset(&rdev->wiphy,
- NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT))
+ for (i = 0; i < wiphy->n_cipher_suites; i++) {
+ switch (wiphy->cipher_suites[i]) {
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev,
+ int key_idx, bool pairwise)
+{
+ int max_key_idx;
+
+ if (pairwise)
+ max_key_idx = 3;
+ else if (wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_PROTECTION) ||
+ wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT))
max_key_idx = 7;
+ else if (cfg80211_igtk_cipher_supported(rdev))
+ max_key_idx = 5;
+ else
+ max_key_idx = 3;
+
if (key_idx < 0 || key_idx > max_key_idx)
+ return false;
+
+ return true;
+}
+
+int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
+ struct key_params *params, int key_idx,
+ bool pairwise, const u8 *mac_addr)
+{
+ if (!cfg80211_valid_key_idx(rdev, key_idx, pairwise))
return -EINVAL;
if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
@@ -308,6 +370,7 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
case WLAN_CIPHER_SUITE_WEP104:
if (key_idx > 3)
return -EINVAL;
+ break;
default:
break;
}
@@ -399,6 +462,11 @@ unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc)
{
unsigned int hdrlen = 24;
+ if (ieee80211_is_ext(fc)) {
+ hdrlen = 4;
+ goto out;
+ }
+
if (ieee80211_is_data(fc)) {
if (ieee80211_has_a4(fc))
hdrlen = 30;
@@ -518,8 +586,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
return -1;
break;
case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
- if (unlikely(iftype != NL80211_IFTYPE_WDS &&
- iftype != NL80211_IFTYPE_MESH_POINT &&
+ if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT &&
iftype != NL80211_IFTYPE_AP_VLAN &&
iftype != NL80211_IFTYPE_STATION))
return -1;
@@ -1019,7 +1086,6 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
case NL80211_IFTYPE_P2P_GO:
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_AP_VLAN:
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_MESH_POINT:
/* bridging OK */
break;
@@ -1031,6 +1097,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
/* not happening */
break;
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_NAN:
WARN_ON(1);
break;
@@ -1244,20 +1311,22 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
{
-#define SCALE 2048
- u16 mcs_divisors[12] = {
- 34133, /* 16.666666... */
- 17067, /* 8.333333... */
- 11378, /* 5.555555... */
- 8533, /* 4.166666... */
- 5689, /* 2.777777... */
- 4267, /* 2.083333... */
- 3923, /* 1.851851... */
- 3413, /* 1.666666... */
- 2844, /* 1.388888... */
- 2560, /* 1.250000... */
- 2276, /* 1.111111... */
- 2048, /* 1.000000... */
+#define SCALE 6144
+ u32 mcs_divisors[14] = {
+ 102399, /* 16.666666... */
+ 51201, /* 8.333333... */
+ 34134, /* 5.555555... */
+ 25599, /* 4.166666... */
+ 17067, /* 2.777777... */
+ 12801, /* 2.083333... */
+ 11769, /* 1.851851... */
+ 10239, /* 1.666666... */
+ 8532, /* 1.388888... */
+ 7680, /* 1.250000... */
+ 6828, /* 1.111111... */
+ 6144, /* 1.000000... */
+ 5690, /* 0.926106... */
+ 5120, /* 0.833333... */
};
u32 rates_160M[3] = { 960777777, 907400000, 816666666 };
u32 rates_969[3] = { 480388888, 453700000, 408333333 };
@@ -1269,7 +1338,7 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
u64 tmp;
u32 result;
- if (WARN_ON_ONCE(rate->mcs > 11))
+ if (WARN_ON_ONCE(rate->mcs > 13))
return 0;
if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2))
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 4d2160c989a3..fd9ad74972fb 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -49,9 +49,6 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info,
case IW_MODE_ADHOC:
type = NL80211_IFTYPE_ADHOC;
break;
- case IW_MODE_REPEAT:
- type = NL80211_IFTYPE_WDS;
- break;
case IW_MODE_MONITOR:
type = NL80211_IFTYPE_MONITOR;
break;
@@ -497,7 +494,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
/*
* We only need to store WEP keys, since they're the only keys that
- * can be be set before a connection is established and persist after
+ * can be set before a connection is established and persist after
* disconnecting.
*/
if (!addr && (params->cipher == WLAN_CIPHER_SUITE_WEP40 ||
@@ -1150,50 +1147,6 @@ static int cfg80211_wext_giwpower(struct net_device *dev,
return 0;
}
-static int cfg80211_wds_wext_siwap(struct net_device *dev,
- struct iw_request_info *info,
- struct sockaddr *addr, char *extra)
-{
- struct wireless_dev *wdev = dev->ieee80211_ptr;
- struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
- int err;
-
- if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS))
- return -EINVAL;
-
- if (addr->sa_family != ARPHRD_ETHER)
- return -EINVAL;
-
- if (netif_running(dev))
- return -EBUSY;
-
- if (!rdev->ops->set_wds_peer)
- return -EOPNOTSUPP;
-
- err = rdev_set_wds_peer(rdev, dev, (u8 *)&addr->sa_data);
- if (err)
- return err;
-
- memcpy(&wdev->wext.bssid, (u8 *) &addr->sa_data, ETH_ALEN);
-
- return 0;
-}
-
-static int cfg80211_wds_wext_giwap(struct net_device *dev,
- struct iw_request_info *info,
- struct sockaddr *addr, char *extra)
-{
- struct wireless_dev *wdev = dev->ieee80211_ptr;
-
- if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS))
- return -EINVAL;
-
- addr->sa_family = ARPHRD_ETHER;
- memcpy(&addr->sa_data, wdev->wext.bssid, ETH_ALEN);
-
- return 0;
-}
-
static int cfg80211_wext_siwrate(struct net_device *dev,
struct iw_request_info *info,
struct iw_param *rate, char *extra)
@@ -1371,8 +1324,6 @@ static int cfg80211_wext_siwap(struct net_device *dev,
return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra);
case NL80211_IFTYPE_STATION:
return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra);
- case NL80211_IFTYPE_WDS:
- return cfg80211_wds_wext_siwap(dev, info, ap_addr, extra);
default:
return -EOPNOTSUPP;
}
@@ -1389,8 +1340,6 @@ static int cfg80211_wext_giwap(struct net_device *dev,
return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra);
case NL80211_IFTYPE_STATION:
return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra);
- case NL80211_IFTYPE_WDS:
- return cfg80211_wds_wext_giwap(dev, info, ap_addr, extra);
default:
return -EOPNOTSUPP;
}
@@ -1472,39 +1421,78 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev,
}
}
+#define DEFINE_WEXT_COMPAT_STUB(func, type) \
+ static int __ ## func(struct net_device *dev, \
+ struct iw_request_info *info, \
+ union iwreq_data *wrqu, \
+ char *extra) \
+ { \
+ return func(dev, info, (type *)wrqu, extra); \
+ }
+
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwname, char)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfreq, struct iw_freq)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfreq, struct iw_freq)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmode, u32)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwmode, u32)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrange, struct iw_point)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwap, struct sockaddr)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwap, struct sockaddr)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmlme, struct iw_point)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwscan, struct iw_point)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwessid, struct iw_point)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwessid, struct iw_point)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrate, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrate, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrts, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrts, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfrag, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfrag, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwretry, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwretry, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencode, struct iw_point)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwencode, struct iw_point)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwpower, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpower, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwgenie, struct iw_point)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwauth, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwauth, struct iw_param)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencodeext, struct iw_point)
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpmksa, struct iw_point)
+
static const iw_handler cfg80211_handlers[] = {
- [IW_IOCTL_IDX(SIOCGIWNAME)] = (iw_handler) cfg80211_wext_giwname,
- [IW_IOCTL_IDX(SIOCSIWFREQ)] = (iw_handler) cfg80211_wext_siwfreq,
- [IW_IOCTL_IDX(SIOCGIWFREQ)] = (iw_handler) cfg80211_wext_giwfreq,
- [IW_IOCTL_IDX(SIOCSIWMODE)] = (iw_handler) cfg80211_wext_siwmode,
- [IW_IOCTL_IDX(SIOCGIWMODE)] = (iw_handler) cfg80211_wext_giwmode,
- [IW_IOCTL_IDX(SIOCGIWRANGE)] = (iw_handler) cfg80211_wext_giwrange,
- [IW_IOCTL_IDX(SIOCSIWAP)] = (iw_handler) cfg80211_wext_siwap,
- [IW_IOCTL_IDX(SIOCGIWAP)] = (iw_handler) cfg80211_wext_giwap,
- [IW_IOCTL_IDX(SIOCSIWMLME)] = (iw_handler) cfg80211_wext_siwmlme,
- [IW_IOCTL_IDX(SIOCSIWSCAN)] = (iw_handler) cfg80211_wext_siwscan,
- [IW_IOCTL_IDX(SIOCGIWSCAN)] = (iw_handler) cfg80211_wext_giwscan,
- [IW_IOCTL_IDX(SIOCSIWESSID)] = (iw_handler) cfg80211_wext_siwessid,
- [IW_IOCTL_IDX(SIOCGIWESSID)] = (iw_handler) cfg80211_wext_giwessid,
- [IW_IOCTL_IDX(SIOCSIWRATE)] = (iw_handler) cfg80211_wext_siwrate,
- [IW_IOCTL_IDX(SIOCGIWRATE)] = (iw_handler) cfg80211_wext_giwrate,
- [IW_IOCTL_IDX(SIOCSIWRTS)] = (iw_handler) cfg80211_wext_siwrts,
- [IW_IOCTL_IDX(SIOCGIWRTS)] = (iw_handler) cfg80211_wext_giwrts,
- [IW_IOCTL_IDX(SIOCSIWFRAG)] = (iw_handler) cfg80211_wext_siwfrag,
- [IW_IOCTL_IDX(SIOCGIWFRAG)] = (iw_handler) cfg80211_wext_giwfrag,
- [IW_IOCTL_IDX(SIOCSIWTXPOW)] = (iw_handler) cfg80211_wext_siwtxpower,
- [IW_IOCTL_IDX(SIOCGIWTXPOW)] = (iw_handler) cfg80211_wext_giwtxpower,
- [IW_IOCTL_IDX(SIOCSIWRETRY)] = (iw_handler) cfg80211_wext_siwretry,
- [IW_IOCTL_IDX(SIOCGIWRETRY)] = (iw_handler) cfg80211_wext_giwretry,
- [IW_IOCTL_IDX(SIOCSIWENCODE)] = (iw_handler) cfg80211_wext_siwencode,
- [IW_IOCTL_IDX(SIOCGIWENCODE)] = (iw_handler) cfg80211_wext_giwencode,
- [IW_IOCTL_IDX(SIOCSIWPOWER)] = (iw_handler) cfg80211_wext_siwpower,
- [IW_IOCTL_IDX(SIOCGIWPOWER)] = (iw_handler) cfg80211_wext_giwpower,
- [IW_IOCTL_IDX(SIOCSIWGENIE)] = (iw_handler) cfg80211_wext_siwgenie,
- [IW_IOCTL_IDX(SIOCSIWAUTH)] = (iw_handler) cfg80211_wext_siwauth,
- [IW_IOCTL_IDX(SIOCGIWAUTH)] = (iw_handler) cfg80211_wext_giwauth,
- [IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= (iw_handler) cfg80211_wext_siwencodeext,
- [IW_IOCTL_IDX(SIOCSIWPMKSA)] = (iw_handler) cfg80211_wext_siwpmksa,
+ [IW_IOCTL_IDX(SIOCGIWNAME)] = __cfg80211_wext_giwname,
+ [IW_IOCTL_IDX(SIOCSIWFREQ)] = __cfg80211_wext_siwfreq,
+ [IW_IOCTL_IDX(SIOCGIWFREQ)] = __cfg80211_wext_giwfreq,
+ [IW_IOCTL_IDX(SIOCSIWMODE)] = __cfg80211_wext_siwmode,
+ [IW_IOCTL_IDX(SIOCGIWMODE)] = __cfg80211_wext_giwmode,
+ [IW_IOCTL_IDX(SIOCGIWRANGE)] = __cfg80211_wext_giwrange,
+ [IW_IOCTL_IDX(SIOCSIWAP)] = __cfg80211_wext_siwap,
+ [IW_IOCTL_IDX(SIOCGIWAP)] = __cfg80211_wext_giwap,
+ [IW_IOCTL_IDX(SIOCSIWMLME)] = __cfg80211_wext_siwmlme,
+ [IW_IOCTL_IDX(SIOCSIWSCAN)] = cfg80211_wext_siwscan,
+ [IW_IOCTL_IDX(SIOCGIWSCAN)] = __cfg80211_wext_giwscan,
+ [IW_IOCTL_IDX(SIOCSIWESSID)] = __cfg80211_wext_siwessid,
+ [IW_IOCTL_IDX(SIOCGIWESSID)] = __cfg80211_wext_giwessid,
+ [IW_IOCTL_IDX(SIOCSIWRATE)] = __cfg80211_wext_siwrate,
+ [IW_IOCTL_IDX(SIOCGIWRATE)] = __cfg80211_wext_giwrate,
+ [IW_IOCTL_IDX(SIOCSIWRTS)] = __cfg80211_wext_siwrts,
+ [IW_IOCTL_IDX(SIOCGIWRTS)] = __cfg80211_wext_giwrts,
+ [IW_IOCTL_IDX(SIOCSIWFRAG)] = __cfg80211_wext_siwfrag,
+ [IW_IOCTL_IDX(SIOCGIWFRAG)] = __cfg80211_wext_giwfrag,
+ [IW_IOCTL_IDX(SIOCSIWTXPOW)] = cfg80211_wext_siwtxpower,
+ [IW_IOCTL_IDX(SIOCGIWTXPOW)] = cfg80211_wext_giwtxpower,
+ [IW_IOCTL_IDX(SIOCSIWRETRY)] = __cfg80211_wext_siwretry,
+ [IW_IOCTL_IDX(SIOCGIWRETRY)] = __cfg80211_wext_giwretry,
+ [IW_IOCTL_IDX(SIOCSIWENCODE)] = __cfg80211_wext_siwencode,
+ [IW_IOCTL_IDX(SIOCGIWENCODE)] = __cfg80211_wext_giwencode,
+ [IW_IOCTL_IDX(SIOCSIWPOWER)] = __cfg80211_wext_siwpower,
+ [IW_IOCTL_IDX(SIOCGIWPOWER)] = __cfg80211_wext_giwpower,
+ [IW_IOCTL_IDX(SIOCSIWGENIE)] = __cfg80211_wext_siwgenie,
+ [IW_IOCTL_IDX(SIOCSIWAUTH)] = __cfg80211_wext_siwauth,
+ [IW_IOCTL_IDX(SIOCGIWAUTH)] = __cfg80211_wext_giwauth,
+ [IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= __cfg80211_wext_siwencodeext,
+ [IW_IOCTL_IDX(SIOCSIWPMKSA)] = __cfg80211_wext_siwpmksa,
};
const struct iw_handler_def cfg80211_wext_handler = {
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 69102fda9ebd..76a80a41615b 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -896,8 +896,9 @@ out:
int call_commit_handler(struct net_device *dev)
{
#ifdef CONFIG_WIRELESS_EXT
- if ((netif_running(dev)) &&
- (dev->wireless_handlers->standard[0] != NULL))
+ if (netif_running(dev) &&
+ dev->wireless_handlers &&
+ dev->wireless_handlers->standard[0])
/* Call the commit handler on the driver */
return dev->wireless_handlers->standard[0](dev, NULL,
NULL, NULL);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 0bbb283f23c9..ff687b97b2d9 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -200,22 +200,6 @@ static void x25_remove_socket(struct sock *sk)
}
/*
- * Kill all bound sockets on a dropped device.
- */
-static void x25_kill_by_device(struct net_device *dev)
-{
- struct sock *s;
-
- write_lock_bh(&x25_list_lock);
-
- sk_for_each(s, &x25_list)
- if (x25_sk(s)->neighbour && x25_sk(s)->neighbour->dev == dev)
- x25_disconnect(s, ENETUNREACH, 0, 0);
-
- write_unlock_bh(&x25_list_lock);
-}
-
-/*
* Handle device status changes.
*/
static int x25_device_event(struct notifier_block *this, unsigned long event,
@@ -227,27 +211,33 @@ static int x25_device_event(struct notifier_block *this, unsigned long event,
if (!net_eq(dev_net(dev), &init_net))
return NOTIFY_DONE;
- if (dev->type == ARPHRD_X25
-#if IS_ENABLED(CONFIG_LLC)
- || dev->type == ARPHRD_ETHER
-#endif
- ) {
+ if (dev->type == ARPHRD_X25) {
switch (event) {
- case NETDEV_UP:
+ case NETDEV_REGISTER:
+ case NETDEV_POST_TYPE_CHANGE:
x25_link_device_up(dev);
break;
- case NETDEV_GOING_DOWN:
+ case NETDEV_DOWN:
nb = x25_get_neigh(dev);
if (nb) {
- x25_terminate_link(nb);
+ x25_link_terminated(nb);
x25_neigh_put(nb);
}
- break;
- case NETDEV_DOWN:
- x25_kill_by_device(dev);
x25_route_device_down(dev);
+ break;
+ case NETDEV_PRE_TYPE_CHANGE:
+ case NETDEV_UNREGISTER:
x25_link_device_down(dev);
break;
+ case NETDEV_CHANGE:
+ if (!netif_carrier_ok(dev)) {
+ nb = x25_get_neigh(dev);
+ if (nb) {
+ x25_link_terminated(nb);
+ x25_neigh_put(nb);
+ }
+ }
+ break;
}
}
@@ -681,7 +671,8 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
int len, i, rc = 0;
if (addr_len != sizeof(struct sockaddr_x25) ||
- addr->sx25_family != AF_X25) {
+ addr->sx25_family != AF_X25 ||
+ strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) {
rc = -EINVAL;
goto out;
}
@@ -775,7 +766,8 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
rc = -EINVAL;
if (addr_len != sizeof(struct sockaddr_x25) ||
- addr->sx25_family != AF_X25)
+ addr->sx25_family != AF_X25 ||
+ strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN)
goto out;
rc = -ENETUNREACH;
@@ -825,7 +817,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTED;
rc = 0;
out_put_neigh:
- if (rc) {
+ if (rc && x25->neighbour) {
read_lock_bh(&x25_list_lock);
x25_neigh_put(x25->neighbour);
x25->neighbour = NULL;
@@ -1050,6 +1042,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
makex25->lci = lci;
makex25->dest_addr = dest_addr;
makex25->source_addr = source_addr;
+ x25_neigh_hold(nb);
makex25->neighbour = nb;
makex25->facilities = facilities;
makex25->dte_facilities= dte_facilities;
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
index 25bf72ee6cad..5259ef8f5242 100644
--- a/net/x25/x25_dev.c
+++ b/net/x25/x25_dev.c
@@ -160,10 +160,6 @@ void x25_establish_link(struct x25_neigh *nb)
*ptr = X25_IFACE_CONNECT;
break;
-#if IS_ENABLED(CONFIG_LLC)
- case ARPHRD_ETHER:
- return;
-#endif
default:
return;
}
@@ -179,10 +175,6 @@ void x25_terminate_link(struct x25_neigh *nb)
struct sk_buff *skb;
unsigned char *ptr;
-#if IS_ENABLED(CONFIG_LLC)
- if (nb->dev->type == ARPHRD_ETHER)
- return;
-#endif
if (nb->dev->type != ARPHRD_X25)
return;
@@ -212,11 +204,6 @@ void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb)
*dptr = X25_IFACE_DATA;
break;
-#if IS_ENABLED(CONFIG_LLC)
- case ARPHRD_ETHER:
- kfree_skb(skb);
- return;
-#endif
default:
kfree_skb(skb);
return;
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index fdae054b7dc1..57a81100c5da 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -58,11 +58,6 @@ static inline void x25_stop_t20timer(struct x25_neigh *nb)
del_timer(&nb->t20timer);
}
-static inline int x25_t20timer_pending(struct x25_neigh *nb)
-{
- return timer_pending(&nb->t20timer);
-}
-
/*
* This handles all restart and diagnostic frames.
*/
@@ -70,20 +65,45 @@ void x25_link_control(struct sk_buff *skb, struct x25_neigh *nb,
unsigned short frametype)
{
struct sk_buff *skbn;
- int confirm;
switch (frametype) {
case X25_RESTART_REQUEST:
- confirm = !x25_t20timer_pending(nb);
- x25_stop_t20timer(nb);
- nb->state = X25_LINK_STATE_3;
- if (confirm)
+ switch (nb->state) {
+ case X25_LINK_STATE_0:
+ /* This can happen when the x25 module just gets loaded
+ * and doesn't know layer 2 has already connected
+ */
+ nb->state = X25_LINK_STATE_3;
x25_transmit_restart_confirmation(nb);
+ break;
+ case X25_LINK_STATE_2:
+ x25_stop_t20timer(nb);
+ nb->state = X25_LINK_STATE_3;
+ break;
+ case X25_LINK_STATE_3:
+ /* clear existing virtual calls */
+ x25_kill_by_neigh(nb);
+
+ x25_transmit_restart_confirmation(nb);
+ break;
+ }
break;
case X25_RESTART_CONFIRMATION:
- x25_stop_t20timer(nb);
- nb->state = X25_LINK_STATE_3;
+ switch (nb->state) {
+ case X25_LINK_STATE_2:
+ x25_stop_t20timer(nb);
+ nb->state = X25_LINK_STATE_3;
+ break;
+ case X25_LINK_STATE_3:
+ /* clear existing virtual calls */
+ x25_kill_by_neigh(nb);
+
+ x25_transmit_restart_request(nb);
+ nb->state = X25_LINK_STATE_2;
+ x25_start_t20timer(nb);
+ break;
+ }
break;
case X25_DIAGNOSTIC:
@@ -214,8 +234,6 @@ void x25_link_established(struct x25_neigh *nb)
{
switch (nb->state) {
case X25_LINK_STATE_0:
- nb->state = X25_LINK_STATE_2;
- break;
case X25_LINK_STATE_1:
x25_transmit_restart_request(nb);
nb->state = X25_LINK_STATE_2;
@@ -232,6 +250,9 @@ void x25_link_established(struct x25_neigh *nb)
void x25_link_terminated(struct x25_neigh *nb)
{
nb->state = X25_LINK_STATE_0;
+ skb_queue_purge(&nb->queue);
+ x25_stop_t20timer(nb);
+
/* Out of order: clear existing virtual calls (X.25 03/93 4.6.3) */
x25_kill_by_neigh(nb);
}
@@ -277,9 +298,6 @@ void x25_link_device_up(struct net_device *dev)
*/
static void __x25_remove_neigh(struct x25_neigh *nb)
{
- skb_queue_purge(&nb->queue);
- x25_stop_t20timer(nb);
-
if (nb->node.next) {
list_del(&nb->node);
x25_neigh_put(nb);
diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c
index 00e46c9a5280..9fbe4bb38d94 100644
--- a/net/x25/x25_route.c
+++ b/net/x25/x25_route.c
@@ -115,9 +115,6 @@ void x25_route_device_down(struct net_device *dev)
__x25_remove_route(rt);
}
write_unlock_bh(&x25_route_list_lock);
-
- /* Remove any related forwarding */
- x25_clear_forward_by_dev(dev);
}
/*
@@ -127,12 +124,7 @@ struct net_device *x25_dev_get(char *devname)
{
struct net_device *dev = dev_get_by_name(&init_net, devname);
- if (dev &&
- (!(dev->flags & IFF_UP) || (dev->type != ARPHRD_X25
-#if IS_ENABLED(CONFIG_LLC)
- && dev->type != ARPHRD_ETHER
-#endif
- ))){
+ if (dev && (!(dev->flags & IFF_UP) || dev->type != ARPHRD_X25)) {
dev_put(dev);
dev = NULL;
}
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index b010bfde0149..56a28a686988 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -23,162 +23,6 @@
static DEFINE_IDA(umem_ida);
-void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
-{
- unsigned long flags;
-
- if (!xs->tx)
- return;
-
- spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
- list_add_rcu(&xs->list, &umem->xsk_tx_list);
- spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
-}
-
-void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
-{
- unsigned long flags;
-
- if (!xs->tx)
- return;
-
- spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
- list_del_rcu(&xs->list);
- spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
-}
-
-/* The umem is stored both in the _rx struct and the _tx struct as we do
- * not know if the device has more tx queues than rx, or the opposite.
- * This might also change during run time.
- */
-static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
- u16 queue_id)
-{
- if (queue_id >= max_t(unsigned int,
- dev->real_num_rx_queues,
- dev->real_num_tx_queues))
- return -EINVAL;
-
- if (queue_id < dev->real_num_rx_queues)
- dev->_rx[queue_id].umem = umem;
- if (queue_id < dev->real_num_tx_queues)
- dev->_tx[queue_id].umem = umem;
-
- return 0;
-}
-
-struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
- u16 queue_id)
-{
- if (queue_id < dev->real_num_rx_queues)
- return dev->_rx[queue_id].umem;
- if (queue_id < dev->real_num_tx_queues)
- return dev->_tx[queue_id].umem;
-
- return NULL;
-}
-EXPORT_SYMBOL(xdp_get_umem_from_qid);
-
-static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
-{
- if (queue_id < dev->real_num_rx_queues)
- dev->_rx[queue_id].umem = NULL;
- if (queue_id < dev->real_num_tx_queues)
- dev->_tx[queue_id].umem = NULL;
-}
-
-int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
- u16 queue_id, u16 flags)
-{
- bool force_zc, force_copy;
- struct netdev_bpf bpf;
- int err = 0;
-
- ASSERT_RTNL();
-
- force_zc = flags & XDP_ZEROCOPY;
- force_copy = flags & XDP_COPY;
-
- if (force_zc && force_copy)
- return -EINVAL;
-
- if (xdp_get_umem_from_qid(dev, queue_id))
- return -EBUSY;
-
- err = xdp_reg_umem_at_qid(dev, umem, queue_id);
- if (err)
- return err;
-
- umem->dev = dev;
- umem->queue_id = queue_id;
-
- if (flags & XDP_USE_NEED_WAKEUP) {
- umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
- /* Tx needs to be explicitly woken up the first time.
- * Also for supporting drivers that do not implement this
- * feature. They will always have to call sendto().
- */
- xsk_set_tx_need_wakeup(umem);
- }
-
- dev_hold(dev);
-
- if (force_copy)
- /* For copy-mode, we are done. */
- return 0;
-
- if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
- err = -EOPNOTSUPP;
- goto err_unreg_umem;
- }
-
- bpf.command = XDP_SETUP_XSK_UMEM;
- bpf.xsk.umem = umem;
- bpf.xsk.queue_id = queue_id;
-
- err = dev->netdev_ops->ndo_bpf(dev, &bpf);
- if (err)
- goto err_unreg_umem;
-
- umem->zc = true;
- return 0;
-
-err_unreg_umem:
- if (!force_zc)
- err = 0; /* fallback to copy mode */
- if (err)
- xdp_clear_umem_at_qid(dev, queue_id);
- return err;
-}
-
-void xdp_umem_clear_dev(struct xdp_umem *umem)
-{
- struct netdev_bpf bpf;
- int err;
-
- ASSERT_RTNL();
-
- if (!umem->dev)
- return;
-
- if (umem->zc) {
- bpf.command = XDP_SETUP_XSK_UMEM;
- bpf.xsk.umem = NULL;
- bpf.xsk.queue_id = umem->queue_id;
-
- err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
-
- if (err)
- WARN(1, "failed to disable umem!\n");
- }
-
- xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
-
- dev_put(umem->dev);
- umem->dev = NULL;
- umem->zc = false;
-}
-
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
@@ -195,25 +39,27 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
}
}
-static void xdp_umem_release(struct xdp_umem *umem)
+static void xdp_umem_addr_unmap(struct xdp_umem *umem)
{
- rtnl_lock();
- xdp_umem_clear_dev(umem);
- rtnl_unlock();
-
- ida_simple_remove(&umem_ida, umem->id);
+ vunmap(umem->addrs);
+ umem->addrs = NULL;
+}
- if (umem->fq) {
- xskq_destroy(umem->fq);
- umem->fq = NULL;
- }
+static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages,
+ u32 nr_pages)
+{
+ umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!umem->addrs)
+ return -ENOMEM;
+ return 0;
+}
- if (umem->cq) {
- xskq_destroy(umem->cq);
- umem->cq = NULL;
- }
+static void xdp_umem_release(struct xdp_umem *umem)
+{
+ umem->zc = false;
+ ida_simple_remove(&umem_ida, umem->id);
- xp_destroy(umem->pool);
+ xdp_umem_addr_unmap(umem);
xdp_umem_unpin_pages(umem);
xdp_umem_unaccount_pages(umem);
@@ -232,14 +78,18 @@ void xdp_get_umem(struct xdp_umem *umem)
refcount_inc(&umem->users);
}
-void xdp_put_umem(struct xdp_umem *umem)
+void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
{
if (!umem)
return;
if (refcount_dec_and_test(&umem->users)) {
- INIT_WORK(&umem->work, xdp_umem_release_deferred);
- schedule_work(&umem->work);
+ if (defer_cleanup) {
+ INIT_WORK(&umem->work, xdp_umem_release_deferred);
+ schedule_work(&umem->work);
+ } else {
+ xdp_umem_release(umem);
+ }
}
}
@@ -319,8 +169,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
return -EINVAL;
}
- if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
- XDP_UMEM_USES_NEED_WAKEUP))
+ if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return -EINVAL;
if (!unaligned_chunks && !is_power_of_2(chunk_size))
@@ -355,13 +204,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
umem->size = size;
umem->headroom = headroom;
umem->chunk_size = chunk_size;
+ umem->chunks = chunks;
umem->npgs = (u32)npgs;
umem->pgs = NULL;
umem->user = NULL;
umem->flags = mr->flags;
- INIT_LIST_HEAD(&umem->xsk_tx_list);
- spin_lock_init(&umem->xsk_tx_list_lock);
+ INIT_LIST_HEAD(&umem->xsk_dma_list);
refcount_set(&umem->users, 1);
err = xdp_umem_account_pages(umem);
@@ -372,15 +221,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (err)
goto out_account;
- umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size,
- headroom, size, unaligned_chunks);
- if (!umem->pool) {
- err = -ENOMEM;
- goto out_pin;
- }
+ err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs);
+ if (err)
+ goto out_unpin;
+
return 0;
-out_pin:
+out_unpin:
xdp_umem_unpin_pages(umem);
out_account:
xdp_umem_unaccount_pages(umem);
@@ -412,8 +259,3 @@ struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
return umem;
}
-
-bool xdp_umem_validate_queues(struct xdp_umem *umem)
-{
- return umem->fq && umem->cq;
-}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 32067fe98f65..aa9fe2780410 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -8,14 +8,8 @@
#include <net/xdp_sock_drv.h>
-int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
- u16 queue_id, u16 flags);
-void xdp_umem_clear_dev(struct xdp_umem *umem);
-bool xdp_umem_validate_queues(struct xdp_umem *umem);
void xdp_get_umem(struct xdp_umem *umem);
-void xdp_put_umem(struct xdp_umem *umem);
-void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
-void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
+void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup);
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 6c5e09e7440a..4a83117507f5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -23,6 +23,7 @@
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
+#include <net/busy_poll.h>
#include <net/xdp.h>
#include "xsk_queue.h"
@@ -33,71 +34,105 @@
static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
-bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
+void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
- return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
- READ_ONCE(xs->umem->fq);
-}
-
-void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
-{
- if (umem->need_wakeup & XDP_WAKEUP_RX)
+ if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
return;
- umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
- umem->need_wakeup |= XDP_WAKEUP_RX;
+ pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
+ pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
-void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
struct xdp_sock *xs;
- if (umem->need_wakeup & XDP_WAKEUP_TX)
+ if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
return;
rcu_read_lock();
- list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
- umem->need_wakeup |= XDP_WAKEUP_TX;
+ pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
-void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
- if (!(umem->need_wakeup & XDP_WAKEUP_RX))
+ if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
return;
- umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
- umem->need_wakeup &= ~XDP_WAKEUP_RX;
+ pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+ pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
-void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
struct xdp_sock *xs;
- if (!(umem->need_wakeup & XDP_WAKEUP_TX))
+ if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
return;
rcu_read_lock();
- list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
- umem->need_wakeup &= ~XDP_WAKEUP_TX;
+ pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
-bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
+{
+ return pool->uses_need_wakeup;
+}
+EXPORT_SYMBOL(xsk_uses_need_wakeup);
+
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
+ u16 queue_id)
+{
+ if (queue_id < dev->real_num_rx_queues)
+ return dev->_rx[queue_id].pool;
+ if (queue_id < dev->real_num_tx_queues)
+ return dev->_tx[queue_id].pool;
+
+ return NULL;
+}
+EXPORT_SYMBOL(xsk_get_pool_from_qid);
+
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
+{
+ if (queue_id < dev->num_rx_queues)
+ dev->_rx[queue_id].pool = NULL;
+ if (queue_id < dev->num_tx_queues)
+ dev->_tx[queue_id].pool = NULL;
+}
+
+/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
+ * not know if the device has more tx queues than rx, or the opposite.
+ * This might also change during run time.
+ */
+int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
+ u16 queue_id)
{
- return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
+ if (queue_id >= max_t(unsigned int,
+ dev->real_num_rx_queues,
+ dev->real_num_tx_queues))
+ return -EINVAL;
+
+ if (queue_id < dev->real_num_rx_queues)
+ dev->_rx[queue_id].pool = pool;
+ if (queue_id < dev->real_num_tx_queues)
+ dev->_tx[queue_id].pool = pool;
+
+ return 0;
}
-EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
void xp_release(struct xdp_buff_xsk *xskb)
{
@@ -155,12 +190,12 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
struct xdp_buff *xsk_xdp;
int err;
- if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
+ if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
xs->rx_dropped++;
return -ENOSPC;
}
- xsk_xdp = xsk_buff_alloc(xs->umem);
+ xsk_xdp = xsk_buff_alloc(xs->pool);
if (!xsk_xdp) {
xs->rx_dropped++;
return -ENOSPC;
@@ -177,6 +212,14 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
return 0;
}
+static bool xsk_tx_writeable(struct xdp_sock *xs)
+{
+ if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
+ return false;
+
+ return true;
+}
+
static bool xsk_is_bound(struct xdp_sock *xs)
{
if (READ_ONCE(xs->state) == XSK_BOUND) {
@@ -198,6 +241,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
return -EINVAL;
+ sk_mark_napi_id_once_xdp(&xs->sk, xdp);
len = xdp->data_end - xdp->data;
return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
@@ -208,7 +252,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
static void xsk_flush(struct xdp_sock *xs)
{
xskq_prod_submit(xs->rx);
- __xskq_cons_release(xs->umem->fq);
+ __xskq_cons_release(xs->pool->fq);
sock_def_readable(&xs->sk);
}
@@ -249,32 +293,33 @@ void __xsk_map_flush(void)
}
}
-void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
- xskq_prod_submit_n(umem->cq, nb_entries);
+ xskq_prod_submit_n(pool->cq, nb_entries);
}
-EXPORT_SYMBOL(xsk_umem_complete_tx);
+EXPORT_SYMBOL(xsk_tx_completed);
-void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+void xsk_tx_release(struct xsk_buff_pool *pool)
{
struct xdp_sock *xs;
rcu_read_lock();
- list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
__xskq_cons_release(xs->tx);
- xs->sk.sk_write_space(&xs->sk);
+ if (xsk_tx_writeable(xs))
+ xs->sk.sk_write_space(&xs->sk);
}
rcu_read_unlock();
}
-EXPORT_SYMBOL(xsk_umem_consume_tx_done);
+EXPORT_SYMBOL(xsk_tx_release);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
+bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
struct xdp_sock *xs;
rcu_read_lock();
- list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
- if (!xskq_cons_peek_desc(xs->tx, desc, umem)) {
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+ if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
xs->tx->queue_empty_descs++;
continue;
}
@@ -284,7 +329,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- if (xskq_prod_reserve_addr(umem->cq, desc->addr))
+ if (xskq_prod_reserve_addr(pool->cq, desc->addr))
goto out;
xskq_cons_release(xs->tx);
@@ -296,7 +341,64 @@ out:
rcu_read_unlock();
return false;
}
-EXPORT_SYMBOL(xsk_umem_consume_tx);
+EXPORT_SYMBOL(xsk_tx_peek_desc);
+
+static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
+ u32 max_entries)
+{
+ u32 nb_pkts = 0;
+
+ while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
+ nb_pkts++;
+
+ xsk_tx_release(pool);
+ return nb_pkts;
+}
+
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
+ u32 max_entries)
+{
+ struct xdp_sock *xs;
+ u32 nb_pkts;
+
+ rcu_read_lock();
+ if (!list_is_singular(&pool->xsk_tx_list)) {
+ /* Fallback to the non-batched version */
+ rcu_read_unlock();
+ return xsk_tx_peek_release_fallback(pool, descs, max_entries);
+ }
+
+ xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
+ if (!xs) {
+ nb_pkts = 0;
+ goto out;
+ }
+
+ nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
+ if (!nb_pkts) {
+ xs->tx->queue_empty_descs++;
+ goto out;
+ }
+
+ /* This is the backpressure mechanism for the Tx path. Try to
+ * reserve space in the completion queue for all packets, but
+ * if there are fewer slots available, just process that many
+ * packets. This avoids having to implement any buffering in
+ * the Tx path.
+ */
+ nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
+ if (!nb_pkts)
+ goto out;
+
+ xskq_cons_release_n(xs->tx, nb_pkts);
+ __xskq_cons_release(xs->tx);
+ xs->sk.sk_write_space(&xs->sk);
+
+out:
+ rcu_read_unlock();
+ return nb_pkts;
+}
+EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
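
To sketch where the batched interface sits in a driver's Tx path, here is a deliberately simplified, hypothetical consumer (BATCH_SIZE and hw_queue_tx() are placeholders invented for the example; only xsk_tx_peek_release_desc_batch(), xsk_buff_raw_get_data() and xsk_tx_completed() are taken from this patch, and a real driver would typically report completions from its Tx interrupt rather than inline):

    #include <linux/types.h>
    #include <net/xdp_sock_drv.h>

    #define BATCH_SIZE 64

    /* Placeholder standing in for the driver's real hardware Tx routine. */
    static void hw_queue_tx(void *data, u32 len)
    {
    }

    static void example_xsk_tx(struct xsk_buff_pool *pool)
    {
            struct xdp_desc descs[BATCH_SIZE];
            u32 i, nb_pkts;

            /* Peek up to BATCH_SIZE descriptors; completion-queue slots are
             * reserved for them in the same call (built-in backpressure).
             */
            nb_pkts = xsk_tx_peek_release_desc_batch(pool, descs, BATCH_SIZE);

            for (i = 0; i < nb_pkts; i++) {
                    void *data = xsk_buff_raw_get_data(pool, descs[i].addr);

                    hw_queue_tx(data, descs[i].len);
            }

            /* Mark the frames completed so user space can reuse the buffers */
            if (nb_pkts)
                    xsk_tx_completed(pool, nb_pkts);
    }
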
static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
@@ -321,9 +423,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
struct xdp_sock *xs = xdp_sk(skb->sk);
unsigned long flags;
- spin_lock_irqsave(&xs->tx_completion_lock, flags);
- xskq_prod_submit_addr(xs->umem->cq, addr);
- spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
+ spin_lock_irqsave(&xs->pool->cq_lock, flags);
+ xskq_prod_submit_addr(xs->pool->cq, addr);
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
sock_wfree(skb);
}
@@ -335,6 +437,7 @@ static int xsk_generic_xmit(struct sock *sk)
bool sent_frame = false;
struct xdp_desc desc;
struct sk_buff *skb;
+ unsigned long flags;
int err = 0;
mutex_lock(&xs->mutex);
@@ -342,7 +445,7 @@ static int xsk_generic_xmit(struct sock *sk)
if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
- while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
+ while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
char *buffer;
u64 addr;
u32 len;
@@ -359,17 +462,20 @@ static int xsk_generic_xmit(struct sock *sk)
skb_put(skb, len);
addr = desc.addr;
- buffer = xsk_buff_raw_get_data(xs->umem, addr);
+ buffer = xsk_buff_raw_get_data(xs->pool, addr);
err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) {
+ spin_lock_irqsave(&xs->pool->cq_lock, flags);
+ if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
kfree_skb(skb);
goto out;
}
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
skb->dev = xs->dev;
skb->priority = sk->sk_priority;
@@ -377,14 +483,13 @@ static int xsk_generic_xmit(struct sock *sk)
skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
skb->destructor = xsk_destruct_skb;
- /* Hinder dev_direct_xmit from freeing the packet and
- * therefore completing it in the destructor
- */
- refcount_inc(&skb->users);
- err = dev_direct_xmit(skb, xs->queue_id);
+ err = __dev_direct_xmit(skb, xs->queue_id);
if (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
skb->destructor = sock_wfree;
+ spin_lock_irqsave(&xs->pool->cq_lock, flags);
+ xskq_prod_cancel(xs->pool->cq);
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
/* Free skb without triggering the perf drop trace */
consume_skb(skb);
err = -EAGAIN;
@@ -395,12 +500,10 @@ static int xsk_generic_xmit(struct sock *sk)
/* Ignore NET_XMIT_CN as packet might have been sent */
if (err == NET_XMIT_DROP) {
/* SKB completed but not sent */
- kfree_skb(skb);
err = -EBUSY;
goto out;
}
- consume_skb(skb);
sent_frame = true;
}
@@ -408,7 +511,8 @@ static int xsk_generic_xmit(struct sock *sk)
out:
if (sent_frame)
- sk->sk_write_space(sk);
+ if (xsk_tx_writeable(xs))
+ sk->sk_write_space(sk);
mutex_unlock(&xs->mutex);
return err;
@@ -426,36 +530,85 @@ static int __xsk_sendmsg(struct sock *sk)
return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}
+static bool xsk_no_wakeup(struct sock *sk)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* Prefer busy-polling, skip the wakeup. */
+ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
+ READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
+#else
+ return false;
+#endif
+}
+
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
+ struct xsk_buff_pool *pool;
+
+ if (unlikely(!xsk_is_bound(xs)))
+ return -ENXIO;
+ if (unlikely(need_wait))
+ return -EOPNOTSUPP;
+
+ if (sk_can_busy_loop(sk))
+ sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+
+ if (xsk_no_wakeup(sk))
+ return 0;
+
+ pool = xs->pool;
+ if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
+ return __xsk_sendmsg(sk);
+ return 0;
+}
+
+static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
+{
+ bool need_wait = !(flags & MSG_DONTWAIT);
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
if (unlikely(!xsk_is_bound(xs)))
return -ENXIO;
+ if (unlikely(!(xs->dev->flags & IFF_UP)))
+ return -ENETDOWN;
+ if (unlikely(!xs->rx))
+ return -ENOBUFS;
if (unlikely(need_wait))
return -EOPNOTSUPP;
- return __xsk_sendmsg(sk);
+ if (sk_can_busy_loop(sk))
+ sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+
+ if (xsk_no_wakeup(sk))
+ return 0;
+
+ if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
+ return xsk_wakeup(xs, XDP_WAKEUP_RX);
+ return 0;
}
static __poll_t xsk_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
- __poll_t mask = datagram_poll(file, sock, wait);
+ __poll_t mask = 0;
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
- struct xdp_umem *umem;
+ struct xsk_buff_pool *pool;
+
+ sock_poll_wait(file, sock, wait);
if (unlikely(!xsk_is_bound(xs)))
return mask;
- umem = xs->umem;
+ pool = xs->pool;
- if (umem->need_wakeup) {
+ if (pool->cached_need_wakeup) {
if (xs->zc)
- xsk_wakeup(xs, umem->need_wakeup);
+ xsk_wakeup(xs, pool->cached_need_wakeup);
else
/* Poll needs to drive Tx also in copy mode */
__xsk_sendmsg(sk);
@@ -463,7 +616,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
if (xs->rx && !xskq_prod_is_empty(xs->rx))
mask |= EPOLLIN | EPOLLRDNORM;
- if (xs->tx && !xskq_cons_is_full(xs->tx))
+ if (xs->tx && xsk_tx_writeable(xs))
mask |= EPOLLOUT | EPOLLWRNORM;
return mask;
@@ -496,7 +649,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
WRITE_ONCE(xs->state, XSK_UNBOUND);
/* Wait for driver to stop using the xdp socket. */
- xdp_del_sk_umem(xs->umem, xs);
+ xp_del_xsk(xs->pool, xs);
xs->dev = NULL;
synchronize_net();
dev_put(dev);
@@ -514,7 +667,7 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
node);
if (node) {
- WARN_ON(xsk_map_inc(node->map));
+ bpf_map_inc(&node->map->map);
map = node->map;
*map_entry = node->map_entry;
}
@@ -544,7 +697,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
xsk_map_try_sock_delete(map, xs, map_entry);
- xsk_map_put(map);
+ bpf_map_put(&map->map);
}
}
@@ -574,6 +727,8 @@ static int xsk_release(struct socket *sock)
xskq_destroy(xs->rx);
xskq_destroy(xs->tx);
+ xskq_destroy(xs->fq_tmp);
+ xskq_destroy(xs->cq_tmp);
sock_orphan(sk);
sock->sk = NULL;
@@ -601,6 +756,11 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
return sock;
}
+static bool xsk_validate_queues(struct xdp_sock *xs)
+{
+ return xs->fq_tmp && xs->cq_tmp;
+}
+
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@@ -669,29 +829,70 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
sockfd_put(sock);
goto out_unlock;
}
- if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
- err = -EINVAL;
- sockfd_put(sock);
- goto out_unlock;
+
+ if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
+ /* Share the umem with another socket on another qid
+ * and/or device.
+ */
+ xs->pool = xp_create_and_assign_umem(xs,
+ umem_xs->umem);
+ if (!xs->pool) {
+ err = -ENOMEM;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
+ err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
+ dev, qid);
+ if (err) {
+ xp_destroy(xs->pool);
+ xs->pool = NULL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+ } else {
+ /* Share the buffer pool with the other socket. */
+ if (xs->fq_tmp || xs->cq_tmp) {
+ /* Do not allow setting your own fq or cq. */
+ err = -EINVAL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
+ xp_get_pool(umem_xs->pool);
+ xs->pool = umem_xs->pool;
}
xdp_get_umem(umem_xs->umem);
WRITE_ONCE(xs->umem, umem_xs->umem);
sockfd_put(sock);
- } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
+ } else if (!xs->umem || !xsk_validate_queues(xs)) {
err = -EINVAL;
goto out_unlock;
} else {
/* This xsk has its own umem. */
- err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
- if (err)
+ xs->pool = xp_create_and_assign_umem(xs, xs->umem);
+ if (!xs->pool) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ err = xp_assign_dev(xs->pool, dev, qid, flags);
+ if (err) {
+ xp_destroy(xs->pool);
+ xs->pool = NULL;
goto out_unlock;
+ }
}
+ /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
+ xs->fq_tmp = NULL;
+ xs->cq_tmp = NULL;
+
xs->dev = dev;
xs->zc = xs->umem->zc;
xs->queue_id = qid;
- xdp_add_sk_umem(xs->umem, xs);
+ xp_add_xsk(xs->pool, xs);
out_unlock:
if (err) {
@@ -797,16 +998,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
mutex_unlock(&xs->mutex);
return -EBUSY;
}
- if (!xs->umem) {
- mutex_unlock(&xs->mutex);
- return -EINVAL;
- }
- q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
- &xs->umem->cq;
+ q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
+ &xs->cq_tmp;
err = xsk_init_queue(entries, q, true);
- if (optname == XDP_UMEM_FILL_RING)
- xp_set_fq(xs->umem->pool, *q);
mutex_unlock(&xs->mutex);
return err;
}
@@ -873,7 +1068,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
if (extra_stats) {
stats.rx_ring_full = xs->rx_queue_full;
stats.rx_fill_ring_empty_descs =
- xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0;
+ xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
} else {
stats.rx_dropped += xs->rx_queue_full;
@@ -975,7 +1170,6 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long size = vma->vm_end - vma->vm_start;
struct xdp_sock *xs = xdp_sk(sock->sk);
struct xsk_queue *q = NULL;
- struct xdp_umem *umem;
unsigned long pfn;
struct page *qpg;
@@ -987,16 +1181,12 @@ static int xsk_mmap(struct file *file, struct socket *sock,
} else if (offset == XDP_PGOFF_TX_RING) {
q = READ_ONCE(xs->tx);
} else {
- umem = READ_ONCE(xs->umem);
- if (!umem)
- return -EINVAL;
-
/* Matches the smp_wmb() in XDP_UMEM_REG */
smp_rmb();
if (offset == XDP_UMEM_PGOFF_FILL_RING)
- q = READ_ONCE(umem->fq);
+ q = READ_ONCE(xs->fq_tmp);
else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
- q = READ_ONCE(umem->cq);
+ q = READ_ONCE(xs->cq_tmp);
}
if (!q)
@@ -1034,8 +1224,8 @@ static int xsk_notifier(struct notifier_block *this,
xsk_unbind_dev(xs);
- /* Clear device references in umem. */
- xdp_umem_clear_dev(xs->umem);
+ /* Clear device references. */
+ xp_clear_dev(xs->pool);
}
mutex_unlock(&xs->mutex);
}
@@ -1067,7 +1257,7 @@ static const struct proto_ops xsk_proto_ops = {
.setsockopt = xsk_setsockopt,
.getsockopt = xsk_getsockopt,
.sendmsg = xsk_sendmsg,
- .recvmsg = sock_no_recvmsg,
+ .recvmsg = xsk_recvmsg,
.mmap = xsk_mmap,
.sendpage = sock_no_sendpage,
};
@@ -1079,7 +1269,8 @@ static void xsk_destruct(struct sock *sk)
if (!sock_flag(sk, SOCK_DEAD))
return;
- xdp_put_umem(xs->umem);
+ if (!xp_put_pool(xs->pool))
+ xdp_put_umem(xs->umem, !xs->pool);
sk_refcnt_debug_dec(sk);
}
@@ -1087,8 +1278,8 @@ static void xsk_destruct(struct sock *sk)
static int xsk_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
- struct sock *sk;
struct xdp_sock *xs;
+ struct sock *sk;
if (!ns_capable(net->user_ns, CAP_NET_RAW))
return -EPERM;
@@ -1119,7 +1310,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
xs->state = XSK_READY;
mutex_init(&xs->mutex);
spin_lock_init(&xs->rx_lock);
- spin_lock_init(&xs->tx_completion_lock);
INIT_LIST_HEAD(&xs->map_list);
spin_lock_init(&xs->map_list_lock);
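
The batched Tx path added above clamps the number of peeked descriptors to the space it can reserve in the completion ring, which is the whole backpressure story. A minimal userspace sketch of that clamping, using hypothetical ring fields rather than the kernel's xsk_queue:

#include <stdint.h>

/* Hypothetical single-producer/single-consumer ring indices. */
struct ring {
	uint32_t prod, cons;
	uint32_t size;		/* power of two */
};

static uint32_t ring_avail(const struct ring *r, uint32_t want)
{
	uint32_t avail = r->prod - r->cons;	/* entries ready to consume */

	return want < avail ? want : avail;
}

static uint32_t ring_free(const struct ring *r, uint32_t want)
{
	uint32_t free = r->size - (r->prod - r->cons);	/* free slots */

	return want < free ? want : free;
}

/* Peek up to max_entries Tx descriptors, but only as many as the
 * completion ring can take; reserve those slots and release the
 * consumed Tx entries, so no buffering is needed in between. */
static uint32_t tx_peek_release_batch(struct ring *tx, struct ring *cq,
				      uint32_t max_entries)
{
	uint32_t n = ring_avail(tx, max_entries);

	n = ring_free(cq, n);
	cq->prod += n;		/* reserve completion slots */
	tx->cons += n;		/* release the peeked descriptors */
	return n;
}
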
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index 455ddd480f3d..edcf249ad1f1 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -11,13 +11,6 @@
#define XSK_NEXT_PG_CONTIG_SHIFT 0
#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
-/* Flags for the umem flags field.
- *
- * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
- * flags. See inlude/uapi/include/linux/if_xdp.h.
- */
-#define XDP_UMEM_USES_NEED_WAKEUP BIT(1)
-
struct xdp_ring_offset_v1 {
__u64 producer;
__u64 consumer;
@@ -46,10 +39,10 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk)
return (struct xdp_sock *)sk;
}
-bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry);
-int xsk_map_inc(struct xsk_map *map);
-void xsk_map_put(struct xsk_map *map);
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
+int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
+ u16 queue_id);
#endif /* XSK_H_ */
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index a2044c245215..20598eea658c 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -2,21 +2,34 @@
#include <net/xsk_buff_pool.h>
#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include "xsk_queue.h"
+#include "xdp_umem.h"
+#include "xsk.h"
-static void xp_addr_unmap(struct xsk_buff_pool *pool)
+void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
{
- vunmap(pool->addrs);
+ unsigned long flags;
+
+ if (!xs->tx)
+ return;
+
+ spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+ list_add_rcu(&xs->tx_list, &pool->xsk_tx_list);
+ spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
}
-static int xp_addr_map(struct xsk_buff_pool *pool,
- struct page **pages, u32 nr_pages)
+void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
{
- pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (!pool->addrs)
- return -ENOMEM;
- return 0;
+ unsigned long flags;
+
+ if (!xs->tx)
+ return;
+
+ spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+ list_del_rcu(&xs->tx_list);
+ spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
}
void xp_destroy(struct xsk_buff_pool *pool)
@@ -24,59 +37,60 @@ void xp_destroy(struct xsk_buff_pool *pool)
if (!pool)
return;
- xp_addr_unmap(pool);
kvfree(pool->heads);
kvfree(pool);
}
-struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
- u32 chunk_size, u32 headroom, u64 size,
- bool unaligned)
+struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
+ struct xdp_umem *umem)
{
struct xsk_buff_pool *pool;
struct xdp_buff_xsk *xskb;
- int err;
u32 i;
- pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL);
+ pool = kvzalloc(struct_size(pool, free_heads, umem->chunks),
+ GFP_KERNEL);
if (!pool)
goto out;
- pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL);
+ pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL);
if (!pool->heads)
goto out;
- pool->chunk_mask = ~((u64)chunk_size - 1);
- pool->addrs_cnt = size;
- pool->heads_cnt = chunks;
- pool->free_heads_cnt = chunks;
- pool->headroom = headroom;
- pool->chunk_size = chunk_size;
- pool->unaligned = unaligned;
- pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM;
+ pool->chunk_mask = ~((u64)umem->chunk_size - 1);
+ pool->addrs_cnt = umem->size;
+ pool->heads_cnt = umem->chunks;
+ pool->free_heads_cnt = umem->chunks;
+ pool->headroom = umem->headroom;
+ pool->chunk_size = umem->chunk_size;
+ pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+ pool->frame_len = umem->chunk_size - umem->headroom -
+ XDP_PACKET_HEADROOM;
+ pool->umem = umem;
+ pool->addrs = umem->addrs;
INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->xsk_tx_list);
+ spin_lock_init(&pool->xsk_tx_list_lock);
+ spin_lock_init(&pool->cq_lock);
+ refcount_set(&pool->users, 1);
+
+ pool->fq = xs->fq_tmp;
+ pool->cq = xs->cq_tmp;
for (i = 0; i < pool->free_heads_cnt; i++) {
xskb = &pool->heads[i];
xskb->pool = pool;
- xskb->xdp.frame_sz = chunk_size - headroom;
+ xskb->xdp.frame_sz = umem->chunk_size - umem->headroom;
pool->free_heads[i] = xskb;
}
- err = xp_addr_map(pool, pages, nr_pages);
- if (!err)
- return pool;
+ return pool;
out:
xp_destroy(pool);
return NULL;
}
-void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq)
-{
- pool->fq = fq;
-}
-
void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
{
u32 i;
@@ -86,70 +100,325 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
}
EXPORT_SYMBOL(xp_set_rxq_info);
-void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
+static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
{
- dma_addr_t *dma;
- u32 i;
+ struct netdev_bpf bpf;
+ int err;
- if (pool->dma_pages_cnt == 0)
+ ASSERT_RTNL();
+
+ if (pool->umem->zc) {
+ bpf.command = XDP_SETUP_XSK_POOL;
+ bpf.xsk.pool = NULL;
+ bpf.xsk.queue_id = pool->queue_id;
+
+ err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
+
+ if (err)
+ WARN(1, "Failed to disable zero-copy!\n");
+ }
+}
+
+static int __xp_assign_dev(struct xsk_buff_pool *pool,
+ struct net_device *netdev, u16 queue_id, u16 flags)
+{
+ bool force_zc, force_copy;
+ struct netdev_bpf bpf;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ force_zc = flags & XDP_ZEROCOPY;
+ force_copy = flags & XDP_COPY;
+
+ if (force_zc && force_copy)
+ return -EINVAL;
+
+ if (xsk_get_pool_from_qid(netdev, queue_id))
+ return -EBUSY;
+
+ pool->netdev = netdev;
+ pool->queue_id = queue_id;
+ err = xsk_reg_pool_at_qid(netdev, pool, queue_id);
+ if (err)
+ return err;
+
+ if (flags & XDP_USE_NEED_WAKEUP)
+ pool->uses_need_wakeup = true;
+ /* Tx needs to be explicitly woken up the first time. This also
+ * covers drivers that do not implement this feature; they will
+ * always have to call sendto() or poll().
+ */
+ pool->cached_need_wakeup = XDP_WAKEUP_TX;
+
+ dev_hold(netdev);
+
+ if (force_copy)
+ /* For copy-mode, we are done. */
+ return 0;
+
+ if (!netdev->netdev_ops->ndo_bpf ||
+ !netdev->netdev_ops->ndo_xsk_wakeup) {
+ err = -EOPNOTSUPP;
+ goto err_unreg_pool;
+ }
+
+ bpf.command = XDP_SETUP_XSK_POOL;
+ bpf.xsk.pool = pool;
+ bpf.xsk.queue_id = queue_id;
+
+ err = netdev->netdev_ops->ndo_bpf(netdev, &bpf);
+ if (err)
+ goto err_unreg_pool;
+
+ if (!pool->dma_pages) {
+ WARN(1, "Driver did not DMA map zero-copy buffers");
+ err = -EINVAL;
+ goto err_unreg_xsk;
+ }
+ pool->umem->zc = true;
+ return 0;
+
+err_unreg_xsk:
+ xp_disable_drv_zc(pool);
+err_unreg_pool:
+ if (!force_zc)
+ err = 0; /* fallback to copy mode */
+ if (err) {
+ xsk_clear_pool_at_qid(netdev, queue_id);
+ dev_put(netdev);
+ }
+ return err;
+}
+
+int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
+ u16 queue_id, u16 flags)
+{
+ return __xp_assign_dev(pool, dev, queue_id, flags);
+}
+
+int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
+ struct net_device *dev, u16 queue_id)
+{
+ u16 flags;
+
+ /* One fill and completion ring required for each queue id. */
+ if (!pool->fq || !pool->cq)
+ return -EINVAL;
+
+ flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY;
+ if (pool->uses_need_wakeup)
+ flags |= XDP_USE_NEED_WAKEUP;
+
+ return __xp_assign_dev(pool, dev, queue_id, flags);
+}
+
+void xp_clear_dev(struct xsk_buff_pool *pool)
+{
+ if (!pool->netdev)
return;
- for (i = 0; i < pool->dma_pages_cnt; i++) {
- dma = &pool->dma_pages[i];
+ xp_disable_drv_zc(pool);
+ xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
+ dev_put(pool->netdev);
+ pool->netdev = NULL;
+}
+
+static void xp_release_deferred(struct work_struct *work)
+{
+ struct xsk_buff_pool *pool = container_of(work, struct xsk_buff_pool,
+ work);
+
+ rtnl_lock();
+ xp_clear_dev(pool);
+ rtnl_unlock();
+
+ if (pool->fq) {
+ xskq_destroy(pool->fq);
+ pool->fq = NULL;
+ }
+
+ if (pool->cq) {
+ xskq_destroy(pool->cq);
+ pool->cq = NULL;
+ }
+
+ xdp_put_umem(pool->umem, false);
+ xp_destroy(pool);
+}
+
+void xp_get_pool(struct xsk_buff_pool *pool)
+{
+ refcount_inc(&pool->users);
+}
+
+bool xp_put_pool(struct xsk_buff_pool *pool)
+{
+ if (!pool)
+ return false;
+
+ if (refcount_dec_and_test(&pool->users)) {
+ INIT_WORK(&pool->work, xp_release_deferred);
+ schedule_work(&pool->work);
+ return true;
+ }
+
+ return false;
+}
+
+static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool)
+{
+ struct xsk_dma_map *dma_map;
+
+ list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) {
+ if (dma_map->netdev == pool->netdev)
+ return dma_map;
+ }
+
+ return NULL;
+}
+
+static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_device *netdev,
+ u32 nr_pages, struct xdp_umem *umem)
+{
+ struct xsk_dma_map *dma_map;
+
+ dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL);
+ if (!dma_map)
+ return NULL;
+
+ dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL);
+ if (!dma_map->dma_pages) {
+ kfree(dma_map);
+ return NULL;
+ }
+
+ dma_map->netdev = netdev;
+ dma_map->dev = dev;
+ dma_map->dma_need_sync = false;
+ dma_map->dma_pages_cnt = nr_pages;
+ refcount_set(&dma_map->users, 1);
+ list_add(&dma_map->list, &umem->xsk_dma_list);
+ return dma_map;
+}
+
+static void xp_destroy_dma_map(struct xsk_dma_map *dma_map)
+{
+ list_del(&dma_map->list);
+ kvfree(dma_map->dma_pages);
+ kfree(dma_map);
+}
+
+static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs)
+{
+ dma_addr_t *dma;
+ u32 i;
+
+ for (i = 0; i < dma_map->dma_pages_cnt; i++) {
+ dma = &dma_map->dma_pages[i];
if (*dma) {
- dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE,
+ dma_unmap_page_attrs(dma_map->dev, *dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, attrs);
*dma = 0;
}
}
+ xp_destroy_dma_map(dma_map);
+}
+
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
+{
+ struct xsk_dma_map *dma_map;
+
+ if (pool->dma_pages_cnt == 0)
+ return;
+
+ dma_map = xp_find_dma_map(pool);
+ if (!dma_map) {
+ WARN(1, "Could not find dma_map for device");
+ return;
+ }
+
+ if (!refcount_dec_and_test(&dma_map->users))
+ return;
+
+ __xp_dma_unmap(dma_map, attrs);
kvfree(pool->dma_pages);
pool->dma_pages_cnt = 0;
pool->dev = NULL;
}
EXPORT_SYMBOL(xp_dma_unmap);
-static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
+static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
{
u32 i;
- for (i = 0; i < pool->dma_pages_cnt - 1; i++) {
- if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1])
- pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
+ for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
+ if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
+ dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
else
- pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
+ dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
}
}
+static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
+{
+ pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL);
+ if (!pool->dma_pages)
+ return -ENOMEM;
+
+ pool->dev = dma_map->dev;
+ pool->dma_pages_cnt = dma_map->dma_pages_cnt;
+ pool->dma_need_sync = dma_map->dma_need_sync;
+ memcpy(pool->dma_pages, dma_map->dma_pages,
+ pool->dma_pages_cnt * sizeof(*pool->dma_pages));
+
+ return 0;
+}
+
int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
unsigned long attrs, struct page **pages, u32 nr_pages)
{
+ struct xsk_dma_map *dma_map;
dma_addr_t dma;
+ int err;
u32 i;
- pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
- GFP_KERNEL);
- if (!pool->dma_pages)
- return -ENOMEM;
+ dma_map = xp_find_dma_map(pool);
+ if (dma_map) {
+ err = xp_init_dma_info(pool, dma_map);
+ if (err)
+ return err;
+
+ refcount_inc(&dma_map->users);
+ return 0;
+ }
- pool->dev = dev;
- pool->dma_pages_cnt = nr_pages;
- pool->dma_need_sync = false;
+ dma_map = xp_create_dma_map(dev, pool->netdev, nr_pages, pool->umem);
+ if (!dma_map)
+ return -ENOMEM;
- for (i = 0; i < pool->dma_pages_cnt; i++) {
+ for (i = 0; i < dma_map->dma_pages_cnt; i++) {
dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
DMA_BIDIRECTIONAL, attrs);
if (dma_mapping_error(dev, dma)) {
- xp_dma_unmap(pool, attrs);
+ __xp_dma_unmap(dma_map, attrs);
return -ENOMEM;
}
if (dma_need_sync(dev, dma))
- pool->dma_need_sync = true;
- pool->dma_pages[i] = dma;
+ dma_map->dma_need_sync = true;
+ dma_map->dma_pages[i] = dma;
}
if (pool->unaligned)
- xp_check_dma_contiguity(pool);
+ xp_check_dma_contiguity(dma_map);
+
+ err = xp_init_dma_info(pool, dma_map);
+ if (err) {
+ __xp_dma_unmap(dma_map, attrs);
+ return err;
+ }
+
return 0;
}
EXPORT_SYMBOL(xp_dma_map);
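
The DMA mapping rework above keeps one refcounted mapping per (umem, netdev) pair so pools sharing the same umem and device reuse it instead of remapping the pages. A minimal sketch of that find-or-create pattern, with hypothetical userspace types in place of xsk_dma_map:

#include <stdlib.h>

struct dma_map {
	const void *netdev;	/* lookup key: the device the pages are mapped for */
	int refs;
	struct dma_map *next;
};

/* Return the existing mapping for netdev with an extra reference,
 * or create and register a new one with refs == 1. */
static struct dma_map *dma_map_get(struct dma_map **head, const void *netdev)
{
	struct dma_map *m;

	for (m = *head; m; m = m->next) {
		if (m->netdev == netdev) {
			m->refs++;
			return m;
		}
	}

	m = calloc(1, sizeof(*m));
	if (!m)
		return NULL;
	m->netdev = netdev;
	m->refs = 1;
	m->next = *head;
	*head = m;
	return m;
}
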
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index 21e9c2d123ee..c014217f5fa7 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -46,6 +46,7 @@ static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs,
static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
{
+ struct xsk_buff_pool *pool = xs->pool;
struct xdp_umem *umem = xs->umem;
struct xdp_diag_umem du = {};
int err;
@@ -58,21 +59,20 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
du.num_pages = umem->npgs;
du.chunk_size = umem->chunk_size;
du.headroom = umem->headroom;
- du.ifindex = umem->dev ? umem->dev->ifindex : 0;
- du.queue_id = umem->queue_id;
+ du.ifindex = (pool && pool->netdev) ? pool->netdev->ifindex : 0;
+ du.queue_id = pool ? pool->queue_id : 0;
du.flags = 0;
if (umem->zc)
du.flags |= XDP_DU_F_ZEROCOPY;
du.refs = refcount_read(&umem->users);
err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du);
-
- if (!err && umem->fq)
- err = xsk_diag_put_ring(umem->fq, XDP_DIAG_UMEM_FILL_RING, nlskb);
- if (!err && umem->cq) {
- err = xsk_diag_put_ring(umem->cq, XDP_DIAG_UMEM_COMPLETION_RING,
- nlskb);
- }
+ if (!err && pool && pool->fq)
+ err = xsk_diag_put_ring(pool->fq,
+ XDP_DIAG_UMEM_FILL_RING, nlskb);
+ if (!err && pool && pool->cq)
+ err = xsk_diag_put_ring(pool->cq,
+ XDP_DIAG_UMEM_COMPLETION_RING, nlskb);
return err;
}
@@ -83,7 +83,7 @@ static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb)
du.n_rx_dropped = xs->rx_dropped;
du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx);
du.n_rx_full = xs->rx_queue_full;
- du.n_fill_ring_empty = xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0;
+ du.n_fill_ring_empty = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx);
du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx);
return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index bf42cfd74b89..2823b7c3302d 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -15,8 +15,14 @@
struct xdp_ring {
u32 producer ____cacheline_aligned_in_smp;
+ /* Keep the adjacent cache-line prefetcher from pulling in the consumer
+ * pointer when the producer pointer is touched, and vice versa.
+ */
+ u32 pad1 ____cacheline_aligned_in_smp;
u32 consumer ____cacheline_aligned_in_smp;
+ u32 pad2 ____cacheline_aligned_in_smp;
u32 flags;
+ u32 pad3 ____cacheline_aligned_in_smp;
};
/* Used for the RX and TX queues for packets */
@@ -96,7 +102,7 @@ struct xsk_queue {
* seen and read by the consumer.
*
* The consumer peeks into the ring to see if the producer has written
- * any new entries. If so, the producer can then read these entries
+ * any new entries. If so, the consumer can then read these entries
* and when it is done reading them release them back to the producer
* so that the producer can use these slots to fill in new entries.
*
@@ -166,9 +172,9 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
struct xdp_desc *d,
- struct xdp_umem *umem)
+ struct xsk_buff_pool *pool)
{
- if (!xp_validate_desc(umem->pool, d)) {
+ if (!xp_validate_desc(pool, d)) {
q->invalid_descs++;
return false;
}
@@ -177,14 +183,14 @@ static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
static inline bool xskq_cons_read_desc(struct xsk_queue *q,
struct xdp_desc *desc,
- struct xdp_umem *umem)
+ struct xsk_buff_pool *pool)
{
while (q->cached_cons != q->cached_prod) {
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
u32 idx = q->cached_cons & q->ring_mask;
*desc = ring->desc[idx];
- if (xskq_cons_is_valid_desc(q, desc, umem))
+ if (xskq_cons_is_valid_desc(q, desc, pool))
return true;
q->cached_cons++;
@@ -193,6 +199,30 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
return false;
}
+static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q,
+ struct xdp_desc *descs,
+ struct xsk_buff_pool *pool, u32 max)
+{
+ u32 cached_cons = q->cached_cons, nb_entries = 0;
+
+ while (cached_cons != q->cached_prod && nb_entries < max) {
+ struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+ u32 idx = cached_cons & q->ring_mask;
+
+ descs[nb_entries] = ring->desc[idx];
+ if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) {
+ /* Skip the entry */
+ cached_cons++;
+ continue;
+ }
+
+ nb_entries++;
+ cached_cons++;
+ }
+
+ return nb_entries;
+}
+
/* Functions for consumers */
static inline void __xskq_cons_release(struct xsk_queue *q)
@@ -214,17 +244,22 @@ static inline void xskq_cons_get_entries(struct xsk_queue *q)
__xskq_cons_peek(q);
}
-static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
+static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max)
{
u32 entries = q->cached_prod - q->cached_cons;
- if (entries >= cnt)
- return true;
+ if (entries >= max)
+ return max;
__xskq_cons_peek(q);
entries = q->cached_prod - q->cached_cons;
- return entries >= cnt;
+ return entries >= max ? max : entries;
+}
+
+static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
+{
+ return xskq_cons_nb_entries(q, cnt) >= cnt;
}
static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
@@ -236,23 +271,35 @@ static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
struct xdp_desc *desc,
- struct xdp_umem *umem)
+ struct xsk_buff_pool *pool)
{
if (q->cached_prod == q->cached_cons)
xskq_cons_get_entries(q);
- return xskq_cons_read_desc(q, desc, umem);
+ return xskq_cons_read_desc(q, desc, pool);
}
+static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs,
+ struct xsk_buff_pool *pool, u32 max)
+{
+ u32 entries = xskq_cons_nb_entries(q, max);
+
+ return xskq_cons_read_desc_batch(q, descs, pool, entries);
+}
+
+/* To improve performance in the xskq_cons_release functions, only update local state here.
+ * Reflect this to global state when we get new entries from the ring in
+ * xskq_cons_get_entries() and whenever Rx or Tx processing is completed in the NAPI loop.
+ */
static inline void xskq_cons_release(struct xsk_queue *q)
{
- /* To improve performance, only update local state here.
- * Reflect this to global state when we get new entries
- * from the ring in xskq_cons_get_entries() and whenever
- * Rx or Tx processing are completed in the NAPI loop.
- */
q->cached_cons++;
}
+static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
+{
+ q->cached_cons += cnt;
+}
+
static inline bool xskq_cons_is_full(struct xsk_queue *q)
{
/* No barriers needed since data is not accessed */
@@ -260,20 +307,36 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q)
q->nentries;
}
+static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
+{
+ /* No barriers needed since data is not accessed */
+ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
+}
+
/* Functions for producers */
-static inline bool xskq_prod_is_full(struct xsk_queue *q)
+static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
- if (free_entries)
- return false;
+ if (free_entries >= max)
+ return max;
/* Refresh the local tail pointer */
q->cached_cons = READ_ONCE(q->ring->consumer);
free_entries = q->nentries - (q->cached_prod - q->cached_cons);
- return !free_entries;
+ return free_entries >= max ? max : free_entries;
+}
+
+static inline bool xskq_prod_is_full(struct xsk_queue *q)
+{
+ return !xskq_prod_nb_free(q, 1);
+}
+
+static inline void xskq_prod_cancel(struct xsk_queue *q)
+{
+ q->cached_prod--;
}
static inline int xskq_prod_reserve(struct xsk_queue *q)
@@ -298,6 +361,23 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
return 0;
}
+static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
+ u32 max)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+ u32 nb_entries, i, cached_prod;
+
+ nb_entries = xskq_prod_nb_free(q, max);
+
+ /* A, matches D */
+ cached_prod = q->cached_prod;
+ for (i = 0; i < nb_entries; i++)
+ ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr;
+ q->cached_prod = cached_prod;
+
+ return nb_entries;
+}
+
static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
u64 addr, u32 len)
{
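
On the padded xdp_ring layout added in this file: the spare cache lines between producer and consumer keep the adjacent-line prefetcher on one side from pulling in the other side's hot word. A rough standalone analogue of that layout, using C11 alignas in place of ____cacheline_aligned_in_smp:

#include <stdint.h>
#include <stdalign.h>

struct padded_ring {
	alignas(64) uint32_t producer;	/* own cache line */
	alignas(64) uint32_t pad1;	/* spare line between the hot words */
	alignas(64) uint32_t consumer;
	alignas(64) uint32_t pad2;
	uint32_t flags;
	alignas(64) uint32_t pad3;
};
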
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 8367adbbe9df..113fd9017203 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -11,32 +11,17 @@
#include "xsk.h"
-int xsk_map_inc(struct xsk_map *map)
-{
- bpf_map_inc(&map->map);
- return 0;
-}
-
-void xsk_map_put(struct xsk_map *map)
-{
- bpf_map_put(&map->map);
-}
-
static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
struct xdp_sock **map_entry)
{
struct xsk_map_node *node;
- int err;
- node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
+ node = bpf_map_kzalloc(&map->map, sizeof(*node),
+ GFP_ATOMIC | __GFP_NOWARN);
if (!node)
return ERR_PTR(-ENOMEM);
- err = xsk_map_inc(map);
- if (err) {
- kfree(node);
- return ERR_PTR(err);
- }
+ bpf_map_inc(&map->map);
node->map = map;
node->map_entry = map_entry;
@@ -45,7 +30,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
static void xsk_map_node_free(struct xsk_map_node *node)
{
- xsk_map_put(node->map);
+ bpf_map_put(&node->map->map);
kfree(node);
}
@@ -73,9 +58,8 @@ static void xsk_map_sock_delete(struct xdp_sock *xs,
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
- struct bpf_map_memory mem;
- int err, numa_node;
struct xsk_map *m;
+ int numa_node;
u64 size;
if (!capable(CAP_NET_ADMIN))
@@ -89,18 +73,11 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
numa_node = bpf_map_attr_numa_node(attr);
size = struct_size(m, xsk_map, attr->max_entries);
- err = bpf_map_charge_init(&mem, size);
- if (err < 0)
- return ERR_PTR(err);
-
m = bpf_map_area_alloc(size, numa_node);
- if (!m) {
- bpf_map_charge_finish(&mem);
+ if (!m)
return ERR_PTR(-ENOMEM);
- }
bpf_map_init_from_attr(&m->map, attr);
- bpf_map_charge_move(&m->map.memory, &mem);
spin_lock_init(&m->lock);
return &m->map;
@@ -132,7 +109,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
struct bpf_insn *insn = insn_buf;
@@ -185,11 +162,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
xs = (struct xdp_sock *)sock->sk;
- if (!xsk_is_setup_for_bpf_map(xs)) {
- sockfd_put(sock);
- return -EOPNOTSUPP;
- }
-
map_entry = &m->xsk_map[i];
node = xsk_map_node_alloc(m, map_entry);
if (IS_ERR(node)) {
@@ -254,8 +226,16 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
spin_unlock_bh(&map->lock);
}
+static bool xsk_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ return meta0->max_entries == meta1->max_entries &&
+ bpf_map_meta_equal(meta0, meta1);
+}
+
static int xsk_map_btf_id;
const struct bpf_map_ops xsk_map_ops = {
+ .map_meta_equal = xsk_map_meta_equal,
.map_alloc = xsk_map_alloc,
.map_free = xsk_map_free,
.map_get_next_key = xsk_map_get_next_key,
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 5b9a5ab48111..3adf31a83a79 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -28,6 +28,17 @@ config XFRM_USER
If unsure, say Y.
+config XFRM_USER_COMPAT
+ tristate "Compatible ABI support"
+ depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT && \
+ HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select WANT_COMPAT_NETLINK_MESSAGES
+ help
+ Transformation (XFRM) user configuration interface (as used by
+ IPsec) for compat (e.g. 32-bit) Linux applications.
+
+ If unsure, say N.
+
config XFRM_INTERFACE
tristate "Transformation virtual interface"
depends on XFRM && IPV6
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 2d4bb4b9f75e..494aa744bfb9 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
obj-$(CONFIG_XFRM_USER) += xfrm_user.o
+obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c
new file mode 100644
index 000000000000..d8e8a11ca845
--- /dev/null
+++ b/net/xfrm/xfrm_compat.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * XFRM compat layer
+ * Author: Dmitry Safonov <dima@arista.com>
+ * Based on code and translator idea by: Florian Westphal <fw@strlen.de>
+ */
+#include <linux/compat.h>
+#include <linux/xfrm.h>
+#include <net/xfrm.h>
+
+struct compat_xfrm_lifetime_cfg {
+ compat_u64 soft_byte_limit, hard_byte_limit;
+ compat_u64 soft_packet_limit, hard_packet_limit;
+ compat_u64 soft_add_expires_seconds, hard_add_expires_seconds;
+ compat_u64 soft_use_expires_seconds, hard_use_expires_seconds;
+}; /* same size on 32bit, but only 4 byte alignment required */
+
+struct compat_xfrm_lifetime_cur {
+ compat_u64 bytes, packets, add_time, use_time;
+}; /* same size on 32bit, but only 4 byte alignment required */
+
+struct compat_xfrm_userpolicy_info {
+ struct xfrm_selector sel;
+ struct compat_xfrm_lifetime_cfg lft;
+ struct compat_xfrm_lifetime_cur curlft;
+ __u32 priority, index;
+ u8 dir, action, flags, share;
+ /* 4 bytes additional padding on 64bit */
+};
+
+struct compat_xfrm_usersa_info {
+ struct xfrm_selector sel;
+ struct xfrm_id id;
+ xfrm_address_t saddr;
+ struct compat_xfrm_lifetime_cfg lft;
+ struct compat_xfrm_lifetime_cur curlft;
+ struct xfrm_stats stats;
+ __u32 seq, reqid;
+ u16 family;
+ u8 mode, replay_window, flags;
+ /* 4 bytes additional padding on 64bit */
+};
+
+struct compat_xfrm_user_acquire {
+ struct xfrm_id id;
+ xfrm_address_t saddr;
+ struct xfrm_selector sel;
+ struct compat_xfrm_userpolicy_info policy;
+ /* 4 bytes additional padding on 64bit */
+ __u32 aalgos, ealgos, calgos, seq;
+};
+
+struct compat_xfrm_userspi_info {
+ struct compat_xfrm_usersa_info info;
+ /* 4 bytes additional padding on 64bit */
+ __u32 min, max;
+};
+
+struct compat_xfrm_user_expire {
+ struct compat_xfrm_usersa_info state;
+ /* 8 bytes additional padding on 64bit */
+ u8 hard;
+};
+
+struct compat_xfrm_user_polexpire {
+ struct compat_xfrm_userpolicy_info pol;
+ /* 8 bytes additional padding on 64bit */
+ u8 hard;
+};
+
+#define XMSGSIZE(type) sizeof(struct type)
+
+static const int compat_msg_min[XFRM_NR_MSGTYPES] = {
+ [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_usersa_info),
+ [XFRM_MSG_DELSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
+ [XFRM_MSG_GETSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
+ [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userpolicy_info),
+ [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+ [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+ [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userspi_info),
+ [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_acquire),
+ [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_expire),
+ [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userpolicy_info),
+ [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_usersa_info),
+ [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_polexpire),
+ [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush),
+ [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0,
+ [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
+ [XFRM_MSG_GETAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
+ [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
+ [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+ [XFRM_MSG_NEWSADINFO - XFRM_MSG_BASE] = sizeof(u32),
+ [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32),
+ [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
+ [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
+ [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping)
+};
+
+static const struct nla_policy compat_policy[XFRMA_MAX+1] = {
+ [XFRMA_SA] = { .len = XMSGSIZE(compat_xfrm_usersa_info)},
+ [XFRMA_POLICY] = { .len = XMSGSIZE(compat_xfrm_userpolicy_info)},
+ [XFRMA_LASTUSED] = { .type = NLA_U64},
+ [XFRMA_ALG_AUTH_TRUNC] = { .len = sizeof(struct xfrm_algo_auth)},
+ [XFRMA_ALG_AEAD] = { .len = sizeof(struct xfrm_algo_aead) },
+ [XFRMA_ALG_AUTH] = { .len = sizeof(struct xfrm_algo) },
+ [XFRMA_ALG_CRYPT] = { .len = sizeof(struct xfrm_algo) },
+ [XFRMA_ALG_COMP] = { .len = sizeof(struct xfrm_algo) },
+ [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) },
+ [XFRMA_TMPL] = { .len = sizeof(struct xfrm_user_tmpl) },
+ [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) },
+ [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) },
+ [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) },
+ [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 },
+ [XFRMA_ETIMER_THRESH] = { .type = NLA_U32 },
+ [XFRMA_SRCADDR] = { .len = sizeof(xfrm_address_t) },
+ [XFRMA_COADDR] = { .len = sizeof(xfrm_address_t) },
+ [XFRMA_POLICY_TYPE] = { .len = sizeof(struct xfrm_userpolicy_type)},
+ [XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) },
+ [XFRMA_KMADDRESS] = { .len = sizeof(struct xfrm_user_kmaddress) },
+ [XFRMA_MARK] = { .len = sizeof(struct xfrm_mark) },
+ [XFRMA_TFCPAD] = { .type = NLA_U32 },
+ [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) },
+ [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 },
+ [XFRMA_PROTO] = { .type = NLA_U8 },
+ [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) },
+ [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) },
+ [XFRMA_SET_MARK] = { .type = NLA_U32 },
+ [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 },
+ [XFRMA_IF_ID] = { .type = NLA_U32 },
+};
+
+static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb,
+ const struct nlmsghdr *nlh_src, u16 type)
+{
+ int payload = compat_msg_min[type];
+ int src_len = xfrm_msg_min[type];
+ struct nlmsghdr *nlh_dst;
+
+ /* Compat messages are shorter or equal to native (+padding) */
+ if (WARN_ON_ONCE(src_len < payload))
+ return ERR_PTR(-EMSGSIZE);
+
+ nlh_dst = nlmsg_put(skb, nlh_src->nlmsg_pid, nlh_src->nlmsg_seq,
+ nlh_src->nlmsg_type, payload, nlh_src->nlmsg_flags);
+ if (!nlh_dst)
+ return ERR_PTR(-EMSGSIZE);
+
+ memset(nlmsg_data(nlh_dst), 0, payload);
+
+ switch (nlh_src->nlmsg_type) {
+ /* Compat message has the same layout as native */
+ case XFRM_MSG_DELSA:
+ case XFRM_MSG_DELPOLICY:
+ case XFRM_MSG_FLUSHSA:
+ case XFRM_MSG_FLUSHPOLICY:
+ case XFRM_MSG_NEWAE:
+ case XFRM_MSG_REPORT:
+ case XFRM_MSG_MIGRATE:
+ case XFRM_MSG_NEWSADINFO:
+ case XFRM_MSG_NEWSPDINFO:
+ case XFRM_MSG_MAPPING:
+ WARN_ON_ONCE(src_len != payload);
+ memcpy(nlmsg_data(nlh_dst), nlmsg_data(nlh_src), src_len);
+ break;
+ /* 4 byte alignment for trailing u64 on native, but not on compat */
+ case XFRM_MSG_NEWSA:
+ case XFRM_MSG_NEWPOLICY:
+ case XFRM_MSG_UPDSA:
+ case XFRM_MSG_UPDPOLICY:
+ WARN_ON_ONCE(src_len != payload + 4);
+ memcpy(nlmsg_data(nlh_dst), nlmsg_data(nlh_src), payload);
+ break;
+ case XFRM_MSG_EXPIRE: {
+ const struct xfrm_user_expire *src_ue = nlmsg_data(nlh_src);
+ struct compat_xfrm_user_expire *dst_ue = nlmsg_data(nlh_dst);
+
+ /* compat_xfrm_user_expire has 4-byte smaller state */
+ memcpy(dst_ue, src_ue, sizeof(dst_ue->state));
+ dst_ue->hard = src_ue->hard;
+ break;
+ }
+ case XFRM_MSG_ACQUIRE: {
+ const struct xfrm_user_acquire *src_ua = nlmsg_data(nlh_src);
+ struct compat_xfrm_user_acquire *dst_ua = nlmsg_data(nlh_dst);
+
+ memcpy(dst_ua, src_ua, offsetof(struct compat_xfrm_user_acquire, aalgos));
+ dst_ua->aalgos = src_ua->aalgos;
+ dst_ua->ealgos = src_ua->ealgos;
+ dst_ua->calgos = src_ua->calgos;
+ dst_ua->seq = src_ua->seq;
+ break;
+ }
+ case XFRM_MSG_POLEXPIRE: {
+ const struct xfrm_user_polexpire *src_upe = nlmsg_data(nlh_src);
+ struct compat_xfrm_user_polexpire *dst_upe = nlmsg_data(nlh_dst);
+
+ /* compat_xfrm_user_polexpire has 4-byte smaller state */
+ memcpy(dst_upe, src_upe, sizeof(dst_upe->pol));
+ dst_upe->hard = src_upe->hard;
+ break;
+ }
+ case XFRM_MSG_ALLOCSPI: {
+ const struct xfrm_userspi_info *src_usi = nlmsg_data(nlh_src);
+ struct compat_xfrm_userspi_info *dst_usi = nlmsg_data(nlh_dst);
+
+ /* compat_xfrm_userspi_info has 4-byte smaller info */
+ memcpy(dst_usi, src_usi, sizeof(src_usi->info));
+ dst_usi->min = src_usi->min;
+ dst_usi->max = src_usi->max;
+ break;
+ }
+ /* Not being sent by kernel */
+ case XFRM_MSG_GETSA:
+ case XFRM_MSG_GETPOLICY:
+ case XFRM_MSG_GETAE:
+ case XFRM_MSG_GETSADINFO:
+ case XFRM_MSG_GETSPDINFO:
+ default:
+ WARN_ONCE(1, "unsupported nlmsg_type %d", nlh_src->nlmsg_type);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ return nlh_dst;
+}
+
+static int xfrm_nla_cpy(struct sk_buff *dst, const struct nlattr *src, int len)
+{
+ return nla_put(dst, src->nla_type, len, nla_data(src));
+}
+
+static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src)
+{
+ switch (src->nla_type) {
+ case XFRMA_PAD:
+ /* Ignore */
+ return 0;
+ case XFRMA_UNSPEC:
+ case XFRMA_ALG_AUTH:
+ case XFRMA_ALG_CRYPT:
+ case XFRMA_ALG_COMP:
+ case XFRMA_ENCAP:
+ case XFRMA_TMPL:
+ return xfrm_nla_cpy(dst, src, nla_len(src));
+ case XFRMA_SA:
+ return xfrm_nla_cpy(dst, src, XMSGSIZE(compat_xfrm_usersa_info));
+ case XFRMA_POLICY:
+ return xfrm_nla_cpy(dst, src, XMSGSIZE(compat_xfrm_userpolicy_info));
+ case XFRMA_SEC_CTX:
+ return xfrm_nla_cpy(dst, src, nla_len(src));
+ case XFRMA_LTIME_VAL:
+ return nla_put_64bit(dst, src->nla_type, nla_len(src),
+ nla_data(src), XFRMA_PAD);
+ case XFRMA_REPLAY_VAL:
+ case XFRMA_REPLAY_THRESH:
+ case XFRMA_ETIMER_THRESH:
+ case XFRMA_SRCADDR:
+ case XFRMA_COADDR:
+ return xfrm_nla_cpy(dst, src, nla_len(src));
+ case XFRMA_LASTUSED:
+ return nla_put_64bit(dst, src->nla_type, nla_len(src),
+ nla_data(src), XFRMA_PAD);
+ case XFRMA_POLICY_TYPE:
+ case XFRMA_MIGRATE:
+ case XFRMA_ALG_AEAD:
+ case XFRMA_KMADDRESS:
+ case XFRMA_ALG_AUTH_TRUNC:
+ case XFRMA_MARK:
+ case XFRMA_TFCPAD:
+ case XFRMA_REPLAY_ESN_VAL:
+ case XFRMA_SA_EXTRA_FLAGS:
+ case XFRMA_PROTO:
+ case XFRMA_ADDRESS_FILTER:
+ case XFRMA_OFFLOAD_DEV:
+ case XFRMA_SET_MARK:
+ case XFRMA_SET_MARK_MASK:
+ case XFRMA_IF_ID:
+ return xfrm_nla_cpy(dst, src, nla_len(src));
+ default:
+ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IF_ID);
+ WARN_ONCE(1, "unsupported nla_type %d", src->nla_type);
+ return -EOPNOTSUPP;
+ }
+}
+
+/* Take kernel-built (64bit layout) and create 32bit layout for userspace */
+static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src)
+{
+ u16 type = nlh_src->nlmsg_type - XFRM_MSG_BASE;
+ const struct nlattr *nla, *attrs;
+ struct nlmsghdr *nlh_dst;
+ int len, remaining;
+
+ nlh_dst = xfrm_nlmsg_put_compat(dst, nlh_src, type);
+ if (IS_ERR(nlh_dst))
+ return PTR_ERR(nlh_dst);
+
+ attrs = nlmsg_attrdata(nlh_src, xfrm_msg_min[type]);
+ len = nlmsg_attrlen(nlh_src, xfrm_msg_min[type]);
+
+ nla_for_each_attr(nla, attrs, len, remaining) {
+ int err = xfrm_xlate64_attr(dst, nla);
+
+ if (err)
+ return err;
+ }
+
+ nlmsg_end(dst, nlh_dst);
+
+ return 0;
+}
+
+static int xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src)
+{
+ u16 type = nlh_src->nlmsg_type - XFRM_MSG_BASE;
+ struct sk_buff *new = NULL;
+ int err;
+
+ if (WARN_ON_ONCE(type >= ARRAY_SIZE(xfrm_msg_min)))
+ return -EOPNOTSUPP;
+
+ if (skb_shinfo(skb)->frag_list == NULL) {
+ new = alloc_skb(skb->len + skb_tailroom(skb), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+ skb_shinfo(skb)->frag_list = new;
+ }
+
+ err = xfrm_xlate64(skb_shinfo(skb)->frag_list, nlh_src);
+ if (err) {
+ if (new) {
+ kfree_skb(new);
+ skb_shinfo(skb)->frag_list = NULL;
+ }
+ return err;
+ }
+
+ return 0;
+}
+
+/* Calculates the length of the translated 64-bit message. */
+static size_t xfrm_user_rcv_calculate_len64(const struct nlmsghdr *src,
+ struct nlattr *attrs[XFRMA_MAX+1])
+{
+ size_t len = nlmsg_len(src);
+
+ switch (src->nlmsg_type) {
+ case XFRM_MSG_NEWSA:
+ case XFRM_MSG_NEWPOLICY:
+ case XFRM_MSG_ALLOCSPI:
+ case XFRM_MSG_ACQUIRE:
+ case XFRM_MSG_UPDPOLICY:
+ case XFRM_MSG_UPDSA:
+ len += 4;
+ break;
+ case XFRM_MSG_EXPIRE:
+ case XFRM_MSG_POLEXPIRE:
+ len += 8;
+ break;
+ default:
+ break;
+ }
+
+ if (attrs[XFRMA_SA])
+ len += 4;
+ if (attrs[XFRMA_POLICY])
+ len += 4;
+
+ /* XXX: some attrs may need to be realigned
+ * if !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ */
+
+ return len;
+}
+
+static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src,
+ size_t size, int copy_len, int payload)
+{
+ struct nlmsghdr *nlmsg = dst;
+ struct nlattr *nla;
+
+ if (WARN_ON_ONCE(copy_len > payload))
+ copy_len = payload;
+
+ if (size - *pos < nla_attr_size(payload))
+ return -ENOBUFS;
+
+ nla = dst + *pos;
+
+ memcpy(nla, src, nla_attr_size(copy_len));
+ nla->nla_len = nla_attr_size(payload);
+ *pos += nla_attr_size(copy_len);
+ nlmsg->nlmsg_len += nla->nla_len;
+
+ memset(dst + *pos, 0, payload - copy_len);
+ *pos += payload - copy_len;
+
+ return 0;
+}
+
+static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
+ size_t *pos, size_t size,
+ struct netlink_ext_ack *extack)
+{
+ int type = nla_type(nla);
+ u16 pol_len32, pol_len64;
+ int err;
+
+ if (type > XFRMA_MAX) {
+ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IF_ID);
+ NL_SET_ERR_MSG(extack, "Bad attribute");
+ return -EOPNOTSUPP;
+ }
+ if (nla_len(nla) < compat_policy[type].len) {
+ NL_SET_ERR_MSG(extack, "Attribute bad length");
+ return -EOPNOTSUPP;
+ }
+
+ pol_len32 = compat_policy[type].len;
+ pol_len64 = xfrma_policy[type].len;
+
+ /* XFRMA_SA and XFRMA_POLICY - need to know how-to translate */
+ if (pol_len32 != pol_len64) {
+ if (nla_len(nla) != compat_policy[type].len) {
+ NL_SET_ERR_MSG(extack, "Attribute bad length");
+ return -EOPNOTSUPP;
+ }
+ err = xfrm_attr_cpy32(dst, pos, nla, size, pol_len32, pol_len64);
+ if (err)
+ return err;
+ }
+
+ return xfrm_attr_cpy32(dst, pos, nla, size, nla_len(nla), nla_len(nla));
+}
+
+static int xfrm_xlate32(struct nlmsghdr *dst, const struct nlmsghdr *src,
+ struct nlattr *attrs[XFRMA_MAX+1],
+ size_t size, u8 type, struct netlink_ext_ack *extack)
+{
+ size_t pos;
+ int i;
+
+ memcpy(dst, src, NLMSG_HDRLEN);
+ dst->nlmsg_len = NLMSG_HDRLEN + xfrm_msg_min[type];
+ memset(nlmsg_data(dst), 0, xfrm_msg_min[type]);
+
+ switch (src->nlmsg_type) {
+ /* Compat message has the same layout as native */
+ case XFRM_MSG_DELSA:
+ case XFRM_MSG_GETSA:
+ case XFRM_MSG_DELPOLICY:
+ case XFRM_MSG_GETPOLICY:
+ case XFRM_MSG_FLUSHSA:
+ case XFRM_MSG_FLUSHPOLICY:
+ case XFRM_MSG_NEWAE:
+ case XFRM_MSG_GETAE:
+ case XFRM_MSG_REPORT:
+ case XFRM_MSG_MIGRATE:
+ case XFRM_MSG_NEWSADINFO:
+ case XFRM_MSG_GETSADINFO:
+ case XFRM_MSG_NEWSPDINFO:
+ case XFRM_MSG_GETSPDINFO:
+ case XFRM_MSG_MAPPING:
+ memcpy(nlmsg_data(dst), nlmsg_data(src), compat_msg_min[type]);
+ break;
+ /* 4 byte alignment for trailing u64 on native, but not on compat */
+ case XFRM_MSG_NEWSA:
+ case XFRM_MSG_NEWPOLICY:
+ case XFRM_MSG_UPDSA:
+ case XFRM_MSG_UPDPOLICY:
+ memcpy(nlmsg_data(dst), nlmsg_data(src), compat_msg_min[type]);
+ break;
+ case XFRM_MSG_EXPIRE: {
+ const struct compat_xfrm_user_expire *src_ue = nlmsg_data(src);
+ struct xfrm_user_expire *dst_ue = nlmsg_data(dst);
+
+ /* compat_xfrm_user_expire has 4-byte smaller state */
+ memcpy(dst_ue, src_ue, sizeof(src_ue->state));
+ dst_ue->hard = src_ue->hard;
+ break;
+ }
+ case XFRM_MSG_ACQUIRE: {
+ const struct compat_xfrm_user_acquire *src_ua = nlmsg_data(src);
+ struct xfrm_user_acquire *dst_ua = nlmsg_data(dst);
+
+ memcpy(dst_ua, src_ua, offsetof(struct compat_xfrm_user_acquire, aalgos));
+ dst_ua->aalgos = src_ua->aalgos;
+ dst_ua->ealgos = src_ua->ealgos;
+ dst_ua->calgos = src_ua->calgos;
+ dst_ua->seq = src_ua->seq;
+ break;
+ }
+ case XFRM_MSG_POLEXPIRE: {
+ const struct compat_xfrm_user_polexpire *src_upe = nlmsg_data(src);
+ struct xfrm_user_polexpire *dst_upe = nlmsg_data(dst);
+
+ /* compat_xfrm_user_polexpire has 4-byte smaller state */
+ memcpy(dst_upe, src_upe, sizeof(src_upe->pol));
+ dst_upe->hard = src_upe->hard;
+ break;
+ }
+ case XFRM_MSG_ALLOCSPI: {
+ const struct compat_xfrm_userspi_info *src_usi = nlmsg_data(src);
+ struct xfrm_userspi_info *dst_usi = nlmsg_data(dst);
+
+ /* compat_xfrm_userspi_info has 4-byte smaller info */
+ memcpy(dst_usi, src_usi, sizeof(src_usi->info));
+ dst_usi->min = src_usi->min;
+ dst_usi->max = src_usi->max;
+ break;
+ }
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported message type");
+ return -EOPNOTSUPP;
+ }
+ pos = dst->nlmsg_len;
+
+ for (i = 1; i < XFRMA_MAX + 1; i++) {
+ int err;
+
+ if (i == XFRMA_PAD)
+ continue;
+
+ if (!attrs[i])
+ continue;
+
+ err = xfrm_xlate32_attr(dst, attrs[i], &pos, size, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32,
+ int maxtype, const struct nla_policy *policy,
+ struct netlink_ext_ack *extack)
+{
+ /* netlink_rcv_skb() checks that the message contains a full struct nlmsghdr */
+ u16 type = h32->nlmsg_type - XFRM_MSG_BASE;
+ struct nlattr *attrs[XFRMA_MAX+1];
+ struct nlmsghdr *h64;
+ size_t len;
+ int err;
+
+ BUILD_BUG_ON(ARRAY_SIZE(xfrm_msg_min) != ARRAY_SIZE(compat_msg_min));
+
+ if (type >= ARRAY_SIZE(xfrm_msg_min))
+ return ERR_PTR(-EINVAL);
+
+ /* Don't call parse: the message might have only nlmsg header */
+ if ((h32->nlmsg_type == XFRM_MSG_GETSA ||
+ h32->nlmsg_type == XFRM_MSG_GETPOLICY) &&
+ (h32->nlmsg_flags & NLM_F_DUMP))
+ return NULL;
+
+ err = nlmsg_parse_deprecated(h32, compat_msg_min[type], attrs,
+ maxtype ? : XFRMA_MAX, policy ? : compat_policy, extack);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ len = xfrm_user_rcv_calculate_len64(h32, attrs);
+ /* The message doesn't need translation */
+ if (len == nlmsg_len(h32))
+ return NULL;
+
+ len += NLMSG_HDRLEN;
+ h64 = kvmalloc(len, GFP_KERNEL);
+ if (!h64)
+ return ERR_PTR(-ENOMEM);
+
+ err = xfrm_xlate32(h64, h32, attrs, len, type, extack);
+ if (err < 0) {
+ kvfree(h64);
+ return ERR_PTR(err);
+ }
+
+ return h64;
+}
+
+static int xfrm_user_policy_compat(u8 **pdata32, int optlen)
+{
+ struct compat_xfrm_userpolicy_info *p = (void *)*pdata32;
+ u8 *src_templates, *dst_templates;
+ u8 *data64;
+
+ if (optlen < sizeof(*p))
+ return -EINVAL;
+
+ data64 = kmalloc_track_caller(optlen + 4, GFP_USER | __GFP_NOWARN);
+ if (!data64)
+ return -ENOMEM;
+
+ memcpy(data64, *pdata32, sizeof(*p));
+ memset(data64 + sizeof(*p), 0, 4);
+
+ src_templates = *pdata32 + sizeof(*p);
+ dst_templates = data64 + sizeof(*p) + 4;
+ memcpy(dst_templates, src_templates, optlen - sizeof(*p));
+
+ kfree(*pdata32);
+ *pdata32 = data64;
+ return 0;
+}
+
+static struct xfrm_translator xfrm_translator = {
+ .owner = THIS_MODULE,
+ .alloc_compat = xfrm_alloc_compat,
+ .rcv_msg_compat = xfrm_user_rcv_msg_compat,
+ .xlate_user_policy_sockptr = xfrm_user_policy_compat,
+};
+
+static int __init xfrm_compat_init(void)
+{
+ return xfrm_register_translator(&xfrm_translator);
+}
+
+static void __exit xfrm_compat_exit(void)
+{
+ xfrm_unregister_translator(&xfrm_translator);
+}
+
+module_init(xfrm_compat_init);
+module_exit(xfrm_compat_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dmitry Safonov");
+MODULE_DESCRIPTION("XFRM 32-bit compatibility layer");
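
A note on the recurring "4 bytes additional padding on 64bit" comments in this new file: a field of type u64 forces 8-byte struct alignment on 64-bit ABIs and thus tail padding, while compat_u64 only requires 4-byte alignment. A self-contained sketch (hypothetical field names) that reproduces the 4-byte size difference:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64_aligned4 __attribute__((aligned(4)));	/* like compat_u64 */

struct native_limits {		/* 8-byte aligned on 64-bit ABIs */
	uint64_t soft, hard;
	uint32_t priority, index;
	uint8_t dir;
};

struct compat_limits {		/* only 4-byte alignment required */
	u64_aligned4 soft, hard;
	uint32_t priority, index;
	uint8_t dir;
};

int main(void)
{
	printf("native: %zu bytes, compat: %zu bytes\n",
	       sizeof(struct native_limits), sizeof(struct compat_limits));
	return 0;
}
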
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 37456d022cfa..1158cd0311d7 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -660,7 +660,7 @@ resume:
/* only the first xfrm gets the encap type */
encap_type = 0;
- if (async && x->repl->recheck(x, skb, seq)) {
+ if (x->repl->recheck(x, skb, seq)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR);
goto drop_unlock;
}
@@ -760,9 +760,9 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr)
}
EXPORT_SYMBOL(xfrm_input_resume);
-static void xfrm_trans_reinject(unsigned long data)
+static void xfrm_trans_reinject(struct tasklet_struct *t)
{
- struct xfrm_trans_tasklet *trans = (void *)data;
+ struct xfrm_trans_tasklet *trans = from_tasklet(trans, t, tasklet);
struct sk_buff_head queue;
struct sk_buff *skb;
@@ -818,7 +818,6 @@ void __init xfrm_input_init(void)
trans = &per_cpu(xfrm_trans_tasklet, i);
__skb_queue_head_init(&trans->queue);
- tasklet_init(&trans->tasklet, xfrm_trans_reinject,
- (unsigned long)trans);
+ tasklet_setup(&trans->tasklet, xfrm_trans_reinject);
}
}
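
On the tasklet_setup()/from_tasklet() conversion above: the callback now recovers its per-CPU state from the embedded tasklet member instead of casting an opaque unsigned long. A minimal sketch of that container_of() recovery pattern, with hypothetical stand-in types:

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work {			/* stand-in for struct tasklet_struct */
	void (*func)(struct work *);
};

struct trans_state {		/* stand-in for struct xfrm_trans_tasklet */
	int queued;
	struct work tasklet;	/* embedded member passed to the callback */
};

static void trans_reinject(struct work *w)
{
	struct trans_state *trans = container_of(w, struct trans_state, tasklet);

	trans->queued = 0;	/* operate on the recovered outer state */
}
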
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index a8f66112c52b..697cdcfbb5e1 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -210,7 +210,6 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
{
const struct xfrm_mode *inner_mode;
- struct pcpu_sw_netstats *tstats;
struct net_device *dev;
struct xfrm_state *x;
struct xfrm_if *xi;
@@ -255,13 +254,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
}
xfrmi_scrub_packet(skb, xnet);
-
- tstats = this_cpu_ptr(dev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(dev, skb->len);
return 0;
}
@@ -326,12 +319,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
err = dst_output(xi->net, skb->sk, skb);
if (net_xmit_eval(err) == 0) {
- struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- tstats->tx_bytes += length;
- tstats->tx_packets++;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_tx_add(dev, 1, length);
} else {
stats->tx_errors++;
stats->tx_aborted_errors++;
@@ -545,35 +533,6 @@ static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
return err;
}
-static void xfrmi_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *s)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct pcpu_sw_netstats *stats;
- struct pcpu_sw_netstats tmp;
- int start;
-
- stats = per_cpu_ptr(dev->tstats, cpu);
- do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
- tmp.rx_packets = stats->rx_packets;
- tmp.rx_bytes = stats->rx_bytes;
- tmp.tx_packets = stats->tx_packets;
- tmp.tx_bytes = stats->tx_bytes;
- } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
-
- s->rx_packets += tmp.rx_packets;
- s->rx_bytes += tmp.rx_bytes;
- s->tx_packets += tmp.tx_packets;
- s->tx_bytes += tmp.tx_bytes;
- }
-
- s->rx_dropped = dev->stats.rx_dropped;
- s->tx_dropped = dev->stats.tx_dropped;
-}
-
static int xfrmi_get_iflink(const struct net_device *dev)
{
struct xfrm_if *xi = netdev_priv(dev);
@@ -581,12 +540,11 @@ static int xfrmi_get_iflink(const struct net_device *dev)
return xi->p.link;
}
-
static const struct net_device_ops xfrmi_netdev_ops = {
.ndo_init = xfrmi_dev_init,
.ndo_uninit = xfrmi_dev_uninit,
.ndo_start_xmit = xfrmi_xmit,
- .ndo_get_stats64 = xfrmi_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = xfrmi_get_iflink,
};
@@ -830,14 +788,14 @@ static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = {
.handler = xfrmi6_rcv_tunnel,
.cb_handler = xfrmi_rcv_cb,
.err_handler = xfrmi6_err,
- .priority = -1,
+ .priority = 2,
};
static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = {
.handler = xfrmi6_rcv_tunnel,
.cb_handler = xfrmi_rcv_cb,
.err_handler = xfrmi6_err,
- .priority = -1,
+ .priority = 2,
};
#endif
@@ -875,14 +833,14 @@ static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = {
.handler = xfrmi4_rcv_tunnel,
.cb_handler = xfrmi_rcv_cb,
.err_handler = xfrmi4_err,
- .priority = -1,
+ .priority = 3,
};
static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = {
.handler = xfrmi4_rcv_tunnel,
.cb_handler = xfrmi_rcv_cb,
.err_handler = xfrmi4_err,
- .priority = -1,
+ .priority = 2,
};
#endif
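
The xfrm_interface.c hunks replace the open-coded per-CPU tstats updates and the hand-rolled xfrmi_get_stats64() with the common netdev helpers. A sketch of the same pattern for a hypothetical driver (my_*), assuming dev->tstats has been allocated (e.g. via netdev_alloc_pcpu_stats() in ndo_init):

#include <linux/netdevice.h>

/* RX path: bump rx_packets/rx_bytes under the per-CPU u64_stats syncp. */
static void my_rx(struct net_device *dev, struct sk_buff *skb)
{
	dev_sw_netstats_rx_add(dev, skb->len);
}

/* TX path: account one packet of 'len' bytes after a successful send. */
static void my_tx_complete(struct net_device *dev, unsigned int len)
{
	dev_sw_netstats_tx_add(dev, 1, len);
}

static const struct net_device_ops my_netdev_ops = {
	/* dev_get_tstats64() folds the per-CPU tstats counters, replacing
	 * the removed xfrmi_get_stats64()-style open-coded loop. */
	.ndo_get_stats64 = dev_get_tstats64,
};
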
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d622c2548d22..b74f28cabe24 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -793,15 +793,22 @@ static int xfrm_policy_addr_delta(const xfrm_address_t *a,
const xfrm_address_t *b,
u8 prefixlen, u16 family)
{
+ u32 ma, mb, mask;
unsigned int pdw, pbi;
int delta = 0;
switch (family) {
case AF_INET:
- if (sizeof(long) == 4 && prefixlen == 0)
- return ntohl(a->a4) - ntohl(b->a4);
- return (ntohl(a->a4) & ((~0UL << (32 - prefixlen)))) -
- (ntohl(b->a4) & ((~0UL << (32 - prefixlen))));
+ if (prefixlen == 0)
+ return 0;
+ mask = ~0U << (32 - prefixlen);
+ ma = ntohl(a->a4) & mask;
+ mb = ntohl(b->a4) & mask;
+ if (ma < mb)
+ delta = -1;
+ else if (ma > mb)
+ delta = 1;
+ break;
case AF_INET6:
pdw = prefixlen >> 5;
pbi = prefixlen & 0x1f;
@@ -812,10 +819,13 @@ static int xfrm_policy_addr_delta(const xfrm_address_t *a,
return delta;
}
if (pbi) {
- u32 mask = ~0u << (32 - pbi);
-
- delta = (ntohl(a->a6[pdw]) & mask) -
- (ntohl(b->a6[pdw]) & mask);
+ mask = ~0U << (32 - pbi);
+ ma = ntohl(a->a6[pdw]) & mask;
+ mb = ntohl(b->a6[pdw]) & mask;
+ if (ma < mb)
+ delta = -1;
+ else if (ma > mb)
+ delta = 1;
}
break;
default:
@@ -3078,8 +3088,8 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
xflo.flags = flags;
/* To accelerate a bit... */
- if ((dst_orig->flags & DST_NOXFRM) ||
- !net->xfrm.policy_count[XFRM_POLICY_OUT])
+ if (!if_id && ((dst_orig->flags & DST_NOXFRM) ||
+ !net->xfrm.policy_count[XFRM_POLICY_OUT]))
goto nopol;
xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
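
The xfrm_policy_addr_delta() rewrite above fixes a subtraction-style comparator: returning the masked difference through an int wraps for large unsigned gaps and flips the sign, which corrupts the ordering of the inexact policy tree. A small userspace illustration of the wraparound (values chosen only to show the effect):

#include <stdio.h>
#include <stdint.h>

static int delta_subtract(uint32_t a, uint32_t b)
{
	return a - b;		/* old style: wraps for large gaps */
}

static int delta_compare(uint32_t a, uint32_t b)
{
	if (a < b)
		return -1;
	return a > b ? 1 : 0;	/* new style: sign is always correct */
}

int main(void)
{
	uint32_t a = 0xffffffff, b = 0x00000001;

	/* a > b, yet the subtraction comparator typically reports -2. */
	printf("subtract: %d, compare: %d\n",
	       delta_subtract(a, b), delta_compare(a, b));
	return 0;
}
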
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index efc89a92961d..d01ca1a18418 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1021,7 +1021,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
if ((x->sel.family &&
(x->sel.family != family ||
!xfrm_selector_match(&x->sel, fl, family))) ||
- !security_xfrm_state_pol_flow_match(x, pol, fl))
+ !security_xfrm_state_pol_flow_match(x, pol,
+ &fl->u.__fl_common))
return;
if (!*best ||
@@ -1036,7 +1037,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
if ((!x->sel.family ||
(x->sel.family == family &&
xfrm_selector_match(&x->sel, fl, family))) &&
- security_xfrm_state_pol_flow_match(x, pol, fl))
+ security_xfrm_state_pol_flow_match(x, pol,
+ &fl->u.__fl_common))
*error = -ESRCH;
}
}
@@ -2004,6 +2006,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high)
int err = -ENOENT;
__be32 minspi = htonl(low);
__be32 maxspi = htonl(high);
+ __be32 newspi = 0;
u32 mark = x->mark.v & x->mark.m;
spin_lock_bh(&x->lock);
@@ -2022,21 +2025,22 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high)
xfrm_state_put(x0);
goto unlock;
}
- x->id.spi = minspi;
+ newspi = minspi;
} else {
u32 spi = 0;
for (h = 0; h < high-low+1; h++) {
spi = low + prandom_u32()%(high-low+1);
x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family);
if (x0 == NULL) {
- x->id.spi = htonl(spi);
+ newspi = htonl(spi);
break;
}
xfrm_state_put(x0);
}
}
- if (x->id.spi) {
+ if (newspi) {
spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ x->id.spi = newspi;
h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
@@ -2296,6 +2300,66 @@ static bool km_is_alive(const struct km_event *c)
return is_alive;
}
+#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT)
+static DEFINE_SPINLOCK(xfrm_translator_lock);
+static struct xfrm_translator __rcu *xfrm_translator;
+
+struct xfrm_translator *xfrm_get_translator(void)
+{
+ struct xfrm_translator *xtr;
+
+ rcu_read_lock();
+ xtr = rcu_dereference(xfrm_translator);
+ if (unlikely(!xtr))
+ goto out;
+ if (!try_module_get(xtr->owner))
+ xtr = NULL;
+out:
+ rcu_read_unlock();
+ return xtr;
+}
+EXPORT_SYMBOL_GPL(xfrm_get_translator);
+
+void xfrm_put_translator(struct xfrm_translator *xtr)
+{
+ module_put(xtr->owner);
+}
+EXPORT_SYMBOL_GPL(xfrm_put_translator);
+
+int xfrm_register_translator(struct xfrm_translator *xtr)
+{
+ int err = 0;
+
+ spin_lock_bh(&xfrm_translator_lock);
+ if (unlikely(xfrm_translator != NULL))
+ err = -EEXIST;
+ else
+ rcu_assign_pointer(xfrm_translator, xtr);
+ spin_unlock_bh(&xfrm_translator_lock);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xfrm_register_translator);
+
+int xfrm_unregister_translator(struct xfrm_translator *xtr)
+{
+ int err = 0;
+
+ spin_lock_bh(&xfrm_translator_lock);
+ if (likely(xfrm_translator != NULL)) {
+ if (rcu_access_pointer(xfrm_translator) != xtr)
+ err = -EINVAL;
+ else
+ RCU_INIT_POINTER(xfrm_translator, NULL);
+ }
+ spin_unlock_bh(&xfrm_translator_lock);
+ synchronize_rcu();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xfrm_unregister_translator);
+#endif
+
int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen)
{
int err;
@@ -2303,9 +2367,6 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen)
struct xfrm_mgr *km;
struct xfrm_policy *pol = NULL;
- if (in_compat_syscall())
- return -EOPNOTSUPP;
-
if (sockptr_is_null(optval) && !optlen) {
xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL);
xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL);
@@ -2320,6 +2381,22 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen)
if (IS_ERR(data))
return PTR_ERR(data);
+ if (in_compat_syscall()) {
+ struct xfrm_translator *xtr = xfrm_get_translator();
+
+ if (!xtr) {
+ kfree(data);
+ return -EOPNOTSUPP;
+ }
+
+ err = xtr->xlate_user_policy_sockptr(&data, optlen);
+ xfrm_put_translator(xtr);
+ if (err) {
+ kfree(data);
+ return err;
+ }
+ }
+
err = -EINVAL;
rcu_read_lock();
list_for_each_entry_rcu(km, &xfrm_km_list, list) {
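
In the xfrm_alloc_spi() hunk above, the chosen SPI is no longer written into x->id.spi speculatively; it is kept in a local newspi and committed together with the byspi hash insertion under xfrm_state_lock, so a failed allocation never leaves a stale SPI on the state and the value only becomes visible alongside its hash entry. A generic, hedged sketch of that commit-under-lock pattern (all names hypothetical):

#include <linux/spinlock.h>
#include <linux/list.h>

struct entry {
	struct hlist_node byid;
	u32 id;			/* 0 means "not allocated yet" */
};

static DEFINE_SPINLOCK(table_lock);
static struct hlist_head table[16];

/* Compute the candidate value outside the lock, but commit the field and
 * the hash linkage together under it; a caller that fails to find a free
 * id simply never touches e->id. */
static void publish_id(struct entry *e, u32 newid)
{
	spin_lock_bh(&table_lock);
	e->id = newid;
	hlist_add_head(&e->byid, &table[newid & 15]);
	spin_unlock_bh(&table_lock);
}
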
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index fbb7d9d06478..0727ac853b55 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -848,21 +848,84 @@ static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb
return 0;
}
+static bool xfrm_redact(void)
+{
+ return IS_ENABLED(CONFIG_SECURITY) &&
+ security_locked_down(LOCKDOWN_XFRM_SECRET);
+}
+
static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
{
struct xfrm_algo *algo;
+ struct xfrm_algo_auth *ap;
struct nlattr *nla;
+ bool redact_secret = xfrm_redact();
nla = nla_reserve(skb, XFRMA_ALG_AUTH,
sizeof(*algo) + (auth->alg_key_len + 7) / 8);
if (!nla)
return -EMSGSIZE;
-
algo = nla_data(nla);
strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name));
- memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8);
+
+ if (redact_secret && auth->alg_key_len)
+ memset(algo->alg_key, 0, (auth->alg_key_len + 7) / 8);
+ else
+ memcpy(algo->alg_key, auth->alg_key,
+ (auth->alg_key_len + 7) / 8);
algo->alg_key_len = auth->alg_key_len;
+ nla = nla_reserve(skb, XFRMA_ALG_AUTH_TRUNC, xfrm_alg_auth_len(auth));
+ if (!nla)
+ return -EMSGSIZE;
+ ap = nla_data(nla);
+ memcpy(ap, auth, sizeof(struct xfrm_algo_auth));
+ if (redact_secret && auth->alg_key_len)
+ memset(ap->alg_key, 0, (auth->alg_key_len + 7) / 8);
+ else
+ memcpy(ap->alg_key, auth->alg_key,
+ (auth->alg_key_len + 7) / 8);
+ return 0;
+}
+
+static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb)
+{
+ struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_AEAD, aead_len(aead));
+ struct xfrm_algo_aead *ap;
+ bool redact_secret = xfrm_redact();
+
+ if (!nla)
+ return -EMSGSIZE;
+
+ ap = nla_data(nla);
+ memcpy(ap, aead, sizeof(*aead));
+
+ if (redact_secret && aead->alg_key_len)
+ memset(ap->alg_key, 0, (aead->alg_key_len + 7) / 8);
+ else
+ memcpy(ap->alg_key, aead->alg_key,
+ (aead->alg_key_len + 7) / 8);
+ return 0;
+}
+
+static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb)
+{
+ struct xfrm_algo *ap;
+ bool redact_secret = xfrm_redact();
+ struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_CRYPT,
+ xfrm_alg_len(ealg));
+ if (!nla)
+ return -EMSGSIZE;
+
+ ap = nla_data(nla);
+ memcpy(ap, ealg, sizeof(*ealg));
+
+ if (redact_secret && ealg->alg_key_len)
+ memset(ap->alg_key, 0, (ealg->alg_key_len + 7) / 8);
+ else
+ memcpy(ap->alg_key, ealg->alg_key,
+ (ealg->alg_key_len + 7) / 8);
+
return 0;
}
@@ -906,20 +969,17 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
goto out;
}
if (x->aead) {
- ret = nla_put(skb, XFRMA_ALG_AEAD, aead_len(x->aead), x->aead);
+ ret = copy_to_user_aead(x->aead, skb);
if (ret)
goto out;
}
if (x->aalg) {
ret = copy_to_user_auth(x->aalg, skb);
- if (!ret)
- ret = nla_put(skb, XFRMA_ALG_AUTH_TRUNC,
- xfrm_alg_auth_len(x->aalg), x->aalg);
if (ret)
goto out;
}
if (x->ealg) {
- ret = nla_put(skb, XFRMA_ALG_CRYPT, xfrm_alg_len(x->ealg), x->ealg);
+ ret = copy_to_user_ealg(x->ealg, skb);
if (ret)
goto out;
}
@@ -975,6 +1035,7 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
struct xfrm_dump_info *sp = ptr;
struct sk_buff *in_skb = sp->in_skb;
struct sk_buff *skb = sp->out_skb;
+ struct xfrm_translator *xtr;
struct xfrm_usersa_info *p;
struct nlmsghdr *nlh;
int err;
@@ -992,6 +1053,18 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
return err;
}
nlmsg_end(skb, nlh);
+
+ xtr = xfrm_get_translator();
+ if (xtr) {
+ err = xtr->alloc_compat(skb, nlh);
+
+ xfrm_put_translator(xtr);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+ }
+
return 0;
}
@@ -1006,7 +1079,6 @@ static int xfrm_dump_sa_done(struct netlink_callback *cb)
return 0;
}
-static const struct nla_policy xfrma_policy[XFRMA_MAX+1];
static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -1083,12 +1155,24 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb,
u32 pid, unsigned int group)
{
struct sock *nlsk = rcu_dereference(net->xfrm.nlsk);
+ struct xfrm_translator *xtr;
if (!nlsk) {
kfree_skb(skb);
return -EPIPE;
}
+ xtr = xfrm_get_translator();
+ if (xtr) {
+ int err = xtr->alloc_compat(skb, nlmsg_hdr(skb));
+
+ xfrm_put_translator(xtr);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+ }
+
return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);
}
@@ -1308,6 +1392,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct xfrm_state *x;
struct xfrm_userspi_info *p;
+ struct xfrm_translator *xtr;
struct sk_buff *resp_skb;
xfrm_address_t *daddr;
int family;
@@ -1358,6 +1443,17 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
}
+ xtr = xfrm_get_translator();
+ if (xtr) {
+ err = xtr->alloc_compat(skb, nlmsg_hdr(skb));
+
+ xfrm_put_translator(xtr);
+ if (err) {
+ kfree_skb(resp_skb);
+ goto out;
+ }
+ }
+
err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid);
out:
@@ -1764,6 +1860,7 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
struct xfrm_userpolicy_info *p;
struct sk_buff *in_skb = sp->in_skb;
struct sk_buff *skb = sp->out_skb;
+ struct xfrm_translator *xtr;
struct nlmsghdr *nlh;
int err;
@@ -1788,6 +1885,18 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
return err;
}
nlmsg_end(skb, nlh);
+
+ xtr = xfrm_get_translator();
+ if (xtr) {
+ err = xtr->alloc_compat(skb, nlh);
+
+ xfrm_put_translator(xtr);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+ }
+
return 0;
}
@@ -2533,7 +2642,7 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
#define XMSGSIZE(type) sizeof(struct type)
-static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
+const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
[XFRM_MSG_NEWSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info),
[XFRM_MSG_DELSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
[XFRM_MSG_GETSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
@@ -2556,10 +2665,11 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
[XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
[XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
};
+EXPORT_SYMBOL_GPL(xfrm_msg_min);
#undef XMSGSIZE
-static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
+const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_SA] = { .len = sizeof(struct xfrm_usersa_info)},
[XFRMA_POLICY] = { .len = sizeof(struct xfrm_userpolicy_info)},
[XFRMA_LASTUSED] = { .type = NLA_U64},
@@ -2591,6 +2701,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_SET_MARK_MASK] = { .type = NLA_U32 },
[XFRMA_IF_ID] = { .type = NLA_U32 },
};
+EXPORT_SYMBOL_GPL(xfrma_policy);
static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
[XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) },
@@ -2640,11 +2751,9 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct nlattr *attrs[XFRMA_MAX+1];
const struct xfrm_link *link;
+ struct nlmsghdr *nlh64 = NULL;
int type, err;
- if (in_compat_syscall())
- return -EOPNOTSUPP;
-
type = nlh->nlmsg_type;
if (type > XFRM_MSG_MAX)
return -EINVAL;
@@ -2656,32 +2765,55 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!netlink_net_capable(skb, CAP_NET_ADMIN))
return -EPERM;
+ if (in_compat_syscall()) {
+ struct xfrm_translator *xtr = xfrm_get_translator();
+
+ if (!xtr)
+ return -EOPNOTSUPP;
+
+ nlh64 = xtr->rcv_msg_compat(nlh, link->nla_max,
+ link->nla_pol, extack);
+ xfrm_put_translator(xtr);
+ if (IS_ERR(nlh64))
+ return PTR_ERR(nlh64);
+ if (nlh64)
+ nlh = nlh64;
+ }
+
if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) &&
(nlh->nlmsg_flags & NLM_F_DUMP)) {
- if (link->dump == NULL)
- return -EINVAL;
+ struct netlink_dump_control c = {
+ .start = link->start,
+ .dump = link->dump,
+ .done = link->done,
+ };
- {
- struct netlink_dump_control c = {
- .start = link->start,
- .dump = link->dump,
- .done = link->done,
- };
- return netlink_dump_start(net->xfrm.nlsk, skb, nlh, &c);
+ if (link->dump == NULL) {
+ err = -EINVAL;
+ goto err;
}
+
+ err = netlink_dump_start(net->xfrm.nlsk, skb, nlh, &c);
+ goto err;
}
err = nlmsg_parse_deprecated(nlh, xfrm_msg_min[type], attrs,
link->nla_max ? : XFRMA_MAX,
link->nla_pol ? : xfrma_policy, extack);
if (err < 0)
- return err;
+ goto err;
- if (link->doit == NULL)
- return -EINVAL;
+ if (link->doit == NULL) {
+ err = -EINVAL;
+ goto err;
+ }
- return link->doit(skb, nlh, attrs);
+ err = link->doit(skb, nlh, attrs);
+
+err:
+ kvfree(nlh64);
+ return err;
}
static void xfrm_netlink_rcv(struct sk_buff *skb)
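
Taken together, the xfrm_user.c hunks hand all three userspace entry points to the optional translator: dump and notify paths call alloc_compat() right after nlmsg_end() to attach a 32-bit rendition of the freshly built message (cancelling it if translation fails), xfrm_user_rcv_msg() converts an incoming 32-bit request into a native nlmsghdr (nlh64, released with kvfree()) before parsing, and xfrm_user_policy() rewrites the compat sockopt payload before dispatch. The callback set implied by these call sites looks roughly like the sketch below; this is reconstructed from the calls visible in the diff, and the authoritative definition lives in include/net/xfrm.h:

/* Hedged sketch reconstructed from the call sites above, not a copy of
 * the header: one optional provider, pinned via try_module_get(owner). */
struct xfrm_translator {
	/* Attach a 32-bit copy of the native message just built in @skb. */
	int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src);

	/* Return a natively laid out nlmsghdr translated from a 32-bit one;
	 * the caller frees it with kvfree(). */
	struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh32,
					   int maxtype,
					   const struct nla_policy *policy,
					   struct netlink_ext_ack *extack);

	/* Rewrite a compat policy blob passed in via setsockopt(). */
	int (*xlate_user_policy_sockptr)(u8 **pdata, int optlen);

	struct module *owner;
};
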